diff options
60 files changed, 4610 insertions, 2610 deletions
diff --git a/compiler/basicTypes/UniqSupply.lhs b/compiler/basicTypes/UniqSupply.lhs index bb40be7ac1..f3fb28ac21 100644 --- a/compiler/basicTypes/UniqSupply.lhs +++ b/compiler/basicTypes/UniqSupply.lhs @@ -109,7 +109,7 @@ takeUniqFromSupply (MkSplitUniqSupply n s1 _) = (mkUniqueGrimily (iBox n), s1) \begin{code} -- | A monad which just gives the ability to obtain 'Unique's -newtype UniqSM result = USM { unUSM :: UniqSupply -> (result, UniqSupply) } +newtype UniqSM result = USM { unUSM :: UniqSupply -> (# result, UniqSupply #) } instance Monad UniqSM where return = returnUs @@ -118,21 +118,21 @@ instance Monad UniqSM where instance Functor UniqSM where fmap f (USM x) = USM (\us -> case x us of - (r, us') -> (f r, us')) + (# r, us' #) -> (# f r, us' #)) instance Applicative UniqSM where pure = returnUs (USM f) <*> (USM x) = USM $ \us -> case f us of - (ff, us') -> case x us' of - (xx, us'') -> (ff xx, us'') + (# ff, us' #) -> case x us' of + (# xx, us'' #) -> (# ff xx, us'' #) -- | Run the 'UniqSM' action, returning the final 'UniqSupply' initUs :: UniqSupply -> UniqSM a -> (a, UniqSupply) -initUs init_us m = case unUSM m init_us of { (r,us) -> (r,us) } +initUs init_us m = case unUSM m init_us of { (# r, us #) -> (r,us) } -- | Run the 'UniqSM' action, discarding the final 'UniqSupply' initUs_ :: UniqSupply -> UniqSM a -> a -initUs_ init_us m = case unUSM m init_us of { (r, _) -> r } +initUs_ init_us m = case unUSM m init_us of { (# r, _ #) -> r } {-# INLINE thenUs #-} {-# INLINE lazyThenUs #-} @@ -142,27 +142,30 @@ initUs_ init_us m = case unUSM m init_us of { (r, _) -> r } @thenUs@ is where we split the @UniqSupply@. 
\begin{code} +liftUSM :: UniqSM a -> UniqSupply -> (a, UniqSupply) +liftUSM (USM m) us = case m us of (# a, us' #) -> (a, us') + instance MonadFix UniqSM where - mfix m = USM (\us -> let (r,us') = unUSM (m r) us in (r,us')) + mfix m = USM (\us -> let (r,us') = liftUSM (m r) us in (# r,us' #)) thenUs :: UniqSM a -> (a -> UniqSM b) -> UniqSM b thenUs (USM expr) cont = USM (\us -> case (expr us) of - (result, us') -> unUSM (cont result) us') + (# result, us' #) -> unUSM (cont result) us') lazyThenUs :: UniqSM a -> (a -> UniqSM b) -> UniqSM b -lazyThenUs (USM expr) cont - = USM (\us -> let (result, us') = expr us in unUSM (cont result) us') +lazyThenUs expr cont + = USM (\us -> let (result, us') = liftUSM expr us in unUSM (cont result) us') thenUs_ :: UniqSM a -> UniqSM b -> UniqSM b thenUs_ (USM expr) (USM cont) - = USM (\us -> case (expr us) of { (_, us') -> cont us' }) + = USM (\us -> case (expr us) of { (# _, us' #) -> cont us' }) returnUs :: a -> UniqSM a -returnUs result = USM (\us -> (result, us)) +returnUs result = USM (\us -> (# result, us #)) getUs :: UniqSM UniqSupply -getUs = USM (\us -> splitUniqSupply us) +getUs = USM (\us -> case splitUniqSupply us of (us1,us2) -> (# us1, us2 #)) -- | A monad for generating unique identifiers class Monad m => MonadUnique m where @@ -177,17 +180,17 @@ class Monad m => MonadUnique m where getUniquesM = liftM uniqsFromSupply getUniqueSupplyM instance MonadUnique UniqSM where - getUniqueSupplyM = USM (\us -> splitUniqSupply us) + getUniqueSupplyM = getUs getUniqueM = getUniqueUs getUniquesM = getUniquesUs getUniqueUs :: UniqSM Unique getUniqueUs = USM (\us -> case splitUniqSupply us of - (us1,us2) -> (uniqFromSupply us1, us2)) + (us1,us2) -> (# uniqFromSupply us1, us2 #)) getUniquesUs :: UniqSM [Unique] getUniquesUs = USM (\us -> case splitUniqSupply us of - (us1,us2) -> (uniqsFromSupply us1, us2)) + (us1,us2) -> (# uniqsFromSupply us1, us2 #)) \end{code} \begin{code} diff --git a/compiler/cmm/BlockId.hs 
b/compiler/cmm/BlockId.hs index feeacb553d..4aedcb7074 100644 --- a/compiler/cmm/BlockId.hs +++ b/compiler/cmm/BlockId.hs @@ -15,7 +15,7 @@ import Outputable import Unique import Compiler.Hoopl as Hoopl hiding (Unique) -import Compiler.Hoopl.GHC (uniqueToInt, uniqueToLbl, lblToUnique) +import Compiler.Hoopl.Internals (uniqueToLbl) ---------------------------------------------------------------- --- Block Ids, their environments, and their sets @@ -31,15 +31,9 @@ compilation unit in which it appears. type BlockId = Hoopl.Label -instance Uniquable BlockId where - getUnique label = getUnique (uniqueToInt $ lblToUnique label) - mkBlockId :: Unique -> BlockId mkBlockId unique = uniqueToLbl $ intToUnique $ getKey unique -instance Outputable BlockId where - ppr label = ppr (getUnique label) - retPtLbl :: BlockId -> CLabel retPtLbl label = mkReturnPtLabel $ getUnique label diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs index f1318c1dc9..1c77409e49 100644 --- a/compiler/cmm/Cmm.hs +++ b/compiler/cmm/Cmm.hs @@ -32,9 +32,9 @@ module Cmm ( import CLabel import BlockId import CmmNode -import OptimizationFuel as F import SMRep import CmmExpr +import UniqSupply import Compiler.Hoopl import Data.Word ( Word8 ) @@ -69,8 +69,6 @@ type CmmGroup = GenCmmGroup CmmStatics CmmTopInfo CmmGraph -- (a) C--, i.e. populated with various C-- constructs -- (Cmm and RawCmm in OldCmm.hs) -- (b) Native code, populated with data/instructions --- --- A second family of instances based on Hoopl is in Cmm.hs. -- | A top-level chunk, abstracted over the type of the contents of -- the basic blocks (Cmm or instructions are the likely instantiations). 
@@ -95,19 +93,23 @@ data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C } type CmmBlock = Block CmmNode C C type CmmReplGraph e x = GenCmmReplGraph CmmNode e x -type GenCmmReplGraph n e x = FuelUniqSM (Maybe (Graph n e x)) -type CmmFwdRewrite f = FwdRewrite FuelUniqSM CmmNode f -type CmmBwdRewrite f = BwdRewrite FuelUniqSM CmmNode f +type GenCmmReplGraph n e x = UniqSM (Maybe (Graph n e x)) +type CmmFwdRewrite f = FwdRewrite UniqSM CmmNode f +type CmmBwdRewrite f = BwdRewrite UniqSM CmmNode f ----------------------------------------------------------------------------- -- Info Tables ----------------------------------------------------------------------------- -data CmmTopInfo = TopInfo {info_tbl :: CmmInfoTable, stack_info :: CmmStackInfo} +data CmmTopInfo = TopInfo { info_tbl :: CmmInfoTable + , stack_info :: CmmStackInfo } data CmmStackInfo = StackInfo { - arg_space :: ByteOff, -- XXX: comment? + arg_space :: ByteOff, + -- number of bytes of arguments on the stack on entry to the + -- the proc. This is filled in by StgCmm.codeGen, and used + -- by the stack allocator later. updfr_space :: Maybe ByteOff -- XXX: comment? } diff --git a/compiler/cmm/CmmBuildInfoTables.hs b/compiler/cmm/CmmBuildInfoTables.hs index 011947f55d..ebe755219b 100644 --- a/compiler/cmm/CmmBuildInfoTables.hs +++ b/compiler/cmm/CmmBuildInfoTables.hs @@ -14,169 +14,53 @@ {-# OPTIONS_GHC -fno-warn-warnings-deprecations #-} module CmmBuildInfoTables ( CAFSet, CAFEnv, cafAnal, localCAFInfo, mkTopCAFInfo - , setInfoTableSRT, setInfoTableStackMap + , setInfoTableSRT , TopSRT, emptySRT, srtToData , bundleCAFs - , lowerSafeForeignCalls - , cafTransfers, liveSlotTransfers - , mkLiveness ) + , cafTransfers ) where #include "HsVersions.h" -- These should not be imported here! 
-import StgCmmForeign import StgCmmUtils -import Constants import Digraph import qualified Prelude as P import Prelude hiding (succ) -import Util import BlockId import Bitmap import CLabel import Cmm import CmmUtils -import CmmStackLayout -import Module -import FastString -import ForeignCall import IdInfo import Data.List import Maybes -import MkGraph as M -import Control.Monad import Name -import OptimizationFuel import Outputable import SMRep import UniqSupply -import Compiler.Hoopl +import Hoopl import Data.Map (Map) import qualified Data.Map as Map -import qualified FiniteMap as Map +import Data.Set (Set) +import qualified Data.Set as Set + +foldSet :: (a -> b -> b) -> b -> Set a -> b +#if __GLASGOW_HASKELL__ < 704 +foldSet = Set.fold +#else +foldSet = Set.foldr +#endif ---------------------------------------------------------------- -- Building InfoTables ----------------------------------------------------------------------- --- Stack Maps - --- Given a block ID, we return a representation of the layout of the stack, --- as suspended before entering that block. --- (For a return site to a function call, the layout does not include the --- parameter passing area (or the "return address" on the stack)). --- If the element is `Nothing`, then it represents a word of the stack that --- does not contain a live pointer. --- If the element is `Just` a register, then it represents a live spill slot --- for a pointer; we assume that a pointer is the size of a word. --- The head of the list represents the young end of the stack where the infotable --- pointer for the block `Bid` is stored. --- The infotable pointer itself is not included in the list. --- Call areas are also excluded from the list: besides the stuff in the update --- frame (and the return infotable), call areas should never be live across --- function calls. - --- RTS Invariant: All pointers must be word-aligned because each bit in the bitmap --- represents a word. 
Consequently, we have to be careful when we see a live slot --- on the stack: if we have packed multiple sub-word values into a word, --- we have to make sure that we only mark the entire word as a non-pointer. - --- Also, don't forget to stop at the old end of the stack (oldByte), --- which may differ depending on whether there is an update frame. - -type RegSlotInfo - = ( Int -- Offset from oldest byte of Old area - , LocalReg -- The register - , Int) -- Width of the register - -live_ptrs :: ByteOff -> BlockEnv SubAreaSet -> AreaMap -> BlockId -> StackLayout -live_ptrs oldByte slotEnv areaMap bid = - -- pprTrace "live_ptrs for" (ppr bid <+> text (show oldByte ++ "-" ++ show youngByte) <+> - -- ppr liveSlots) $ - -- pprTrace ("stack layout for " ++ show bid ++ ": ") (ppr res) $ res - res - where - res = mkLiveness (reverse $ slotsToList youngByte liveSlots []) - - slotsToList :: Int -> [RegSlotInfo] -> [Maybe LocalReg] -> [Maybe LocalReg] - -- n starts at youngByte and is decremented down to oldByte - -- Returns a list, one element per word, with - -- (Just r) meaning 'pointer register r is saved here', - -- Nothing meaning 'non-pointer or empty' - - slotsToList n [] results | n == oldByte = results -- at old end of stack frame - - slotsToList n (s : _) _ | n == oldByte = - pprPanic "slot left off live_ptrs" (ppr s <+> ppr oldByte <+> - ppr n <+> ppr liveSlots <+> ppr youngByte) - - slotsToList n _ _ | n < oldByte = - panic "stack slots not allocated on word boundaries?" 
- - slotsToList n l@((n', r, w) : rst) results = - if n == (n' + w) then -- slot's young byte is at n - ASSERT (not (isPtr r) || - (n `mod` wORD_SIZE == 0 && w == wORD_SIZE)) -- ptrs must be aligned - slotsToList next (dropWhile (non_ptr_younger_than next) rst) - (stack_rep : results) - else slotsToList next (dropWhile (non_ptr_younger_than next) l) - (Nothing : results) - where next = n - wORD_SIZE - stack_rep = if isPtr r then Just r else Nothing - - slotsToList n [] results = slotsToList (n - wORD_SIZE) [] (Nothing : results) - - non_ptr_younger_than next (n', r, w) = - n' + w > next && - ASSERT (not (isPtr r)) - True - isPtr = isGcPtrType . localRegType - - liveSlots :: [RegSlotInfo] - liveSlots = sortBy (\ (off,_,_) (off',_,_) -> compare off' off) - (Map.foldRightWithKey (\_ -> flip $ foldl add_slot) [] slots) - - add_slot :: [RegSlotInfo] -> SubArea -> [RegSlotInfo] - add_slot rst (a@(RegSlot r@(LocalReg _ ty)), off, w) = - if off == w && widthInBytes (typeWidth ty) == w then - (expectJust "add_slot" (Map.lookup a areaMap), r, w) : rst - else panic "live_ptrs: only part of a variable live at a proc point" - add_slot rst (CallArea Old, _, _) = - rst -- the update frame (or return infotable) should be live - -- would be nice to check that only that part of the callarea is live... - add_slot rst ((CallArea _), _, _) = - rst - -- JD: THIS ISN'T CURRENTLY A CORRECTNESS PROBLEM, BUT WE SHOULD REALLY - -- MAKE LIVENESS INFO AROUND CALLS MORE PRECISE -- FOR NOW, A 32-BIT - -- FLOAT PADS OUT TO 64 BITS, BUT WE ASSUME THE WHOLE PARAMETER-PASSING - -- AREA IS LIVE (WHICH IT ISN'T...). WE SHOULD JUST PUT THE LIVE AREAS - -- IN THE CALL NODES, WHICH SHOULD EVENTUALLY HAVE LIVE REGISTER AS WELL, - -- SO IT'S ALL GOING IN THE SAME DIRECTION. 
- -- pprPanic "CallAreas must not be live across function calls" (ppr bid <+> ppr c) - - slots :: SubAreaSet -- The SubAreaSet for 'bid' - slots = expectJust "live_ptrs slots" $ mapLookup bid slotEnv - youngByte = expectJust "live_ptrs bid_pos" $ Map.lookup (CallArea (Young bid)) areaMap - --- Construct the stack maps for a procedure _if_ it needs an infotable. --- When wouldn't a procedure need an infotable? If it is a procpoint that --- is not the successor of a call. -setInfoTableStackMap :: SlotEnv -> AreaMap -> CmmDecl -> CmmDecl -setInfoTableStackMap slotEnv areaMap - t@(CmmProc (TopInfo {stack_info=StackInfo {updfr_space = Just updfr_off}}) _ - (CmmGraph {g_entry = eid})) - = updInfo (const (live_ptrs updfr_off slotEnv areaMap eid)) id t -setInfoTableStackMap _ _ t = t - - - ------------------------------------------------------------------------ -- SRTs -- WE NEED AN EXAMPLE HERE. @@ -191,14 +75,14 @@ setInfoTableStackMap _ _ t = t ----------------------------------------------------------------------- -- Finding the CAFs used by a procedure -type CAFSet = Map CLabel () +type CAFSet = Set CLabel type CAFEnv = BlockEnv CAFSet -- First, an analysis to find live CAFs. 
cafLattice :: DataflowLattice CAFSet -cafLattice = DataflowLattice "live cafs" Map.empty add - where add _ (OldFact old) (NewFact new) = case old `Map.union` new of - new' -> (changeIf $ Map.size new' > Map.size old, new') +cafLattice = DataflowLattice "live cafs" Set.empty add + where add _ (OldFact old) (NewFact new) = case old `Set.union` new of + new' -> (changeIf $ Set.size new' > Set.size old, new') cafTransfers :: BwdTransfer CmmNode CAFSet cafTransfers = mkBTransfer3 first middle last @@ -210,11 +94,11 @@ cafTransfers = mkBTransfer3 first middle last CmmLit (CmmLabelOff c _) -> add c set CmmLit (CmmLabelDiffOff c1 c2 _) -> add c1 $ add c2 set _ -> set - add l s = if hasCAF l then Map.insert (toClosureLbl l) () s + add l s = if hasCAF l then Set.insert (toClosureLbl l) s else s -cafAnal :: CmmGraph -> FuelUniqSM CAFEnv -cafAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd cafLattice cafTransfers +cafAnal :: CmmGraph -> CAFEnv +cafAnal g = dataflowAnalBwd g [] $ analBwd cafLattice cafTransfers ----------------------------------------------------------------------- -- Building the SRTs @@ -264,15 +148,15 @@ srtToData srt = [CmmData RelocatableReadOnlyData (Statics (lbl srt) tbl)] -- we make sure they're all close enough to the bottom of the table that the -- bitmap will be able to cover all of them. buildSRTs :: TopSRT -> Map CLabel CAFSet -> CAFSet -> - FuelUniqSM (TopSRT, Maybe CmmDecl, C_SRT) + UniqSM (TopSRT, Maybe CmmDecl, C_SRT) buildSRTs topSRT topCAFMap cafs = - do let liftCAF lbl () z = -- get CAFs for functions without static closures - case Map.lookup lbl topCAFMap of Just cafs -> z `Map.union` cafs - Nothing -> Map.insert lbl () z + do let liftCAF lbl z = -- get CAFs for functions without static closures + case Map.lookup lbl topCAFMap of Just cafs -> z `Set.union` cafs + Nothing -> Set.insert lbl z -- For each label referring to a function f without a static closure, -- replace it with the CAFs that are reachable from f. 
sub_srt topSRT localCafs = - let cafs = Map.keys (Map.foldRightWithKey liftCAF Map.empty localCafs) + let cafs = Set.elems (foldSet liftCAF Set.empty localCafs) mkSRT topSRT = do localSRTs <- procpointSRT (lbl topSRT) (elt_map topSRT) cafs return (topSRT, localSRTs) @@ -307,7 +191,7 @@ buildSRTs topSRT topCAFMap cafs = -- Construct an SRT bitmap. -- Adapted from simpleStg/SRT.lhs, which expects Id's. procpointSRT :: CLabel -> Map CLabel Int -> [CLabel] -> - FuelUniqSM (Maybe CmmDecl, C_SRT) + UniqSM (Maybe CmmDecl, C_SRT) procpointSRT _ _ [] = return (Nothing, NoC_SRT) procpointSRT top_srt top_table entries = @@ -325,7 +209,7 @@ maxBmpSize :: Int maxBmpSize = widthInBits wordWidth `div` 2 -- Adapted from codeGen/StgCmmUtils, which converts from SRT to C_SRT. -to_SRT :: CLabel -> Int -> Int -> Bitmap -> FuelUniqSM (Maybe CmmDecl, C_SRT) +to_SRT :: CLabel -> Int -> Int -> Bitmap -> UniqSM (Maybe CmmDecl, C_SRT) to_SRT top_srt off len bmp | len > maxBmpSize || bmp == [fromIntegral srt_escape] = do id <- getUniqueM @@ -373,30 +257,30 @@ mkTopCAFInfo localCAFs = foldl addToTop Map.empty g Map.insert l (flatten env cafset) env addToTop env (CyclicSCC nodes) = let (lbls, cafsets) = unzip nodes - cafset = lbls `Map.deleteList` foldl Map.union Map.empty cafsets + cafset = foldr Set.delete (foldl Set.union Set.empty cafsets) lbls in foldl (\env l -> Map.insert l (flatten env cafset) env) env lbls - flatten env cafset = Map.foldRightWithKey (lookup env) Map.empty cafset - lookup env caf () cafset' = - case Map.lookup caf env of Just cafs -> Map.foldRightWithKey add cafset' cafs - Nothing -> add caf () cafset' - add caf () cafset' = Map.insert caf () cafset' + flatten env cafset = foldSet (lookup env) Set.empty cafset + lookup env caf cafset' = + case Map.lookup caf env of Just cafs -> foldSet add cafset' cafs + Nothing -> add caf cafset' + add caf cafset' = Set.insert caf cafset' g = stronglyConnCompFromEdgedVertices - (map (\n@(l, cafs) -> (n, l, Map.keys cafs)) localCAFs) + 
(map (\n@(l, cafs) -> (n, l, Set.elems cafs)) localCAFs) -- Bundle the CAFs used at a procpoint. bundleCAFs :: CAFEnv -> CmmDecl -> (CAFSet, CmmDecl) bundleCAFs cafEnv t@(CmmProc _ _ (CmmGraph {g_entry=entry})) = (expectJust "bundleCAFs" (mapLookup entry cafEnv), t) -bundleCAFs _ t = (Map.empty, t) +bundleCAFs _ t = (Set.empty, t) -- Construct the SRTs for the given procedure. setInfoTableSRT :: Map CLabel CAFSet -> TopSRT -> (CAFSet, CmmDecl) -> - FuelUniqSM (TopSRT, [CmmDecl]) + UniqSM (TopSRT, [CmmDecl]) setInfoTableSRT topCAFMap topSRT (cafs, t) = setSRT cafs topCAFMap topSRT t setSRT :: CAFSet -> Map CLabel CAFSet -> TopSRT -> - CmmDecl -> FuelUniqSM (TopSRT, [CmmDecl]) + CmmDecl -> UniqSM (TopSRT, [CmmDecl]) setSRT cafs topCAFMap topSRT t = do (topSRT, cafTable, srt) <- buildSRTs topSRT topCAFMap cafs let t' = updInfo id (const srt) t @@ -418,91 +302,3 @@ updInfoTbl toVars toSrt info_tbl@(CmmInfoTable {}) StackRep ls -> StackRep (toVars ls) other -> other } updInfoTbl _ _ t@CmmNonInfoTable = t - ----------------------------------------------------------------- --- Safe foreign calls: We need to insert the code that suspends and resumes --- the thread before and after a safe foreign call. --- Why do we do this so late in the pipeline? --- Because we need this code to appear without interrruption: you can't rely on the --- value of the stack pointer between the call and resetting the thread state; --- you need to have an infotable on the young end of the stack both when --- suspending the thread and making the foreign call. --- All of this is much easier if we insert the suspend and resume calls here. - --- At the same time, we prepare for the stages of the compiler that --- build the proc points. We have to do this at the same time because --- the safe foreign calls need special treatment with respect to infotables. --- A safe foreign call needs an infotable even though it isn't --- a procpoint. 
The following datatype captures the information --- needed to generate the infotables along with the Cmm data and procedures. - --- JD: Why not do this while splitting procedures? -lowerSafeForeignCalls :: AreaMap -> CmmDecl -> FuelUniqSM CmmDecl -lowerSafeForeignCalls _ t@(CmmData _ _) = return t -lowerSafeForeignCalls areaMap (CmmProc info l g@(CmmGraph {g_entry=entry})) = do - let block b mblocks = mblocks >>= lowerSafeCallBlock entry areaMap b - blocks <- foldGraphBlocks block (return mapEmpty) g - return $ CmmProc info l (ofBlockMap entry blocks) - --- If the block ends with a safe call in the block, lower it to an unsafe --- call (with appropriate saves and restores before and after). -lowerSafeCallBlock :: BlockId -> AreaMap -> CmmBlock -> BlockEnv CmmBlock - -> FuelUniqSM (BlockEnv CmmBlock) -lowerSafeCallBlock entry areaMap b blocks = - case blockToNodeList b of - (JustC (CmmEntry id), m, JustC l@(CmmForeignCall {})) -> lowerSafeForeignCall entry areaMap blocks id m l - _ -> return $ insertBlock b blocks - --- Late in the code generator, we want to insert the code necessary --- to lower a safe foreign call to a sequence of unsafe calls. 
-lowerSafeForeignCall :: BlockId -> AreaMap -> BlockEnv CmmBlock -> BlockId -> [CmmNode O O] -> CmmNode O C - -> FuelUniqSM (BlockEnv CmmBlock) -lowerSafeForeignCall entry areaMap blocks bid m - (CmmForeignCall {tgt=tgt, res=rs, args=as, succ=succ, updfr = updfr_off, intrbl = intrbl}) = - do let newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep) - -- Both 'id' and 'new_base' are KindNonPtr because they're - -- RTS-only objects and are not subject to garbage collection - id <- newTemp bWord - new_base <- newTemp (cmmRegType (CmmGlobal BaseReg)) - let (caller_save, caller_load) = callerSaveVolatileRegs - load_tso <- newTemp gcWord -- TODO FIXME NOW - load_stack <- newTemp gcWord -- TODO FIXME NOW - let (<**>) = (M.<*>) - let suspendThread = foreignLbl "suspendThread" - resumeThread = foreignLbl "resumeThread" - foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit name))) - suspend = saveThreadState <**> - caller_save <**> - mkUnsafeCall (ForeignTarget suspendThread - (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint])) - [id] [CmmReg (CmmGlobal BaseReg), CmmLit (CmmInt (fromIntegral (fromEnum intrbl)) wordWidth)] - midCall = mkUnsafeCall tgt rs as - resume = mkUnsafeCall (ForeignTarget resumeThread - (ForeignConvention CCallConv [AddrHint] [AddrHint])) - [new_base] [CmmReg (CmmLocal id)] <**> - -- Assign the result to BaseReg: we - -- might now have a different Capability! - mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <**> - caller_load <**> - loadThreadState load_tso load_stack - -- We have to save the return value on the stack because its next use - -- may appear in a different procedure due to procpoint splitting... - saveRetVals = foldl (<**>) emptyAGraph $ map (M.mkMiddle . 
spill) rs - spill r = CmmStore (regSlot r) (CmmReg $ CmmLocal r) - regSlot r@(LocalReg _ _) = CmmRegOff (CmmGlobal Sp) (sp_off - offset) - where offset = w + expectJust "lowerForeign" (Map.lookup (RegSlot r) areaMap) - sp_off = wORD_SIZE + expectJust "lowerForeign" (Map.lookup (CallArea area) areaMap) - area = if succ == entry then Old else Young succ - w = widthInBytes $ typeWidth $ localRegType r - -- Note: The successor must be a procpoint, and we have already split, - -- so we use a jump, not a branch. - succLbl = CmmLit (CmmLabel (infoTblLbl succ)) - jump = CmmCall { cml_target = succLbl, cml_cont = Nothing - , cml_args = widthInBytes wordWidth ,cml_ret_args = 0 - , cml_ret_off = updfr_off} - graph' <- liftUniq $ labelAGraph bid $ catAGraphs (map M.mkMiddle m) <**> - suspend <**> midCall <**> - resume <**> saveRetVals <**> M.mkLast jump - return $ blocks `mapUnion` toBlockMap graph' -lowerSafeForeignCall _ _ _ _ _ _ = panic "lowerSafeForeignCall was passed something else" - diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs index d3d9ba4b41..484e89cd9b 100644 --- a/compiler/cmm/CmmCallConv.hs +++ b/compiler/cmm/CmmCallConv.hs @@ -53,7 +53,6 @@ assignArgumentsPos conv arg_ty reps = assignments ([_], PrimOpReturn) -> allRegs (_, PrimOpReturn) -> getRegsWithNode (_, Slow) -> noRegs - _ -> pprPanic "Unknown calling convention" (ppr conv) -- The calling conventions first assign arguments to registers, -- then switch to the stack when we first run out of registers -- (even if there are still available registers for args of a different type). 
diff --git a/compiler/cmm/CmmCommonBlockElim.hs b/compiler/cmm/CmmCommonBlockElim.hs index abbfd01156..eafa2a00f3 100644 --- a/compiler/cmm/CmmCommonBlockElim.hs +++ b/compiler/cmm/CmmCommonBlockElim.hs @@ -13,22 +13,22 @@ where import BlockId import Cmm import CmmUtils +import CmmContFlowOpt import Prelude hiding (iterate, succ, unzip, zip) -import Compiler.Hoopl +import Hoopl hiding (ChangeFlag) import Data.Bits import qualified Data.List as List import Data.Word -import FastString -import Control.Monad import Outputable import UniqFM -import Unique my_trace :: String -> SDoc -> a -> a my_trace = if False then pprTrace else \_ _ a -> a --- Eliminate common blocks: +-- ----------------------------------------------------------------------------- +-- Eliminate common blocks + -- If two blocks are identical except for the label on the first node, -- then we can eliminate one of the blocks. To ensure that the semantics -- of the program are preserved, we have to rewrite each predecessor of the @@ -42,59 +42,50 @@ my_trace = if False then pprTrace else \_ _ a -> a -- TODO: Use optimization fuel elimCommonBlocks :: CmmGraph -> CmmGraph -elimCommonBlocks g = - upd_graph g . 
snd $ iterate common_block reset hashed_blocks - (emptyUFM, mapEmpty) - where hashed_blocks = map (\b -> (hash_block b, b)) (reverse (postorderDfs g)) - reset (_, subst) = (emptyUFM, subst) +elimCommonBlocks g = replaceLabels env g + where + env = iterate hashed_blocks mapEmpty + hashed_blocks = map (\b -> (hash_block b, b)) $ postorderDfs g -- Iterate over the blocks until convergence -iterate :: (t -> a -> (Bool, t)) -> (t -> t) -> [a] -> t -> t -iterate upd reset blocks state = - case foldl upd' (False, state) blocks of - (True, state') -> iterate upd reset blocks (reset state') - (False, state') -> state' - where upd' (b, s) a = let (b', s') = upd s a in (b || b', s') -- lift to track changes +iterate :: [(HashCode,CmmBlock)] -> BlockEnv BlockId -> BlockEnv BlockId +iterate blocks subst = + case foldl common_block (False, emptyUFM, subst) blocks of + (changed, _, subst) + | changed -> iterate blocks subst + | otherwise -> subst + +type State = (ChangeFlag, UniqFM [CmmBlock], BlockEnv BlockId) + +type ChangeFlag = Bool +type HashCode = Int -- Try to find a block that is equal (or ``common'') to b. 
-type BidMap = BlockEnv BlockId -type State = (UniqFM [CmmBlock], BidMap) -common_block :: (Outputable h, Uniquable h) => State -> (h, CmmBlock) -> (Bool, State) -common_block (bmap, subst) (hash, b) = +common_block :: State -> (HashCode, CmmBlock) -> State +common_block (old_change, bmap, subst) (hash, b) = case lookupUFM bmap hash of Just bs -> case (List.find (eqBlockBodyWith (eqBid subst) b) bs, mapLookup bid subst) of (Just b', Nothing) -> addSubst b' (Just b', Just b'') | entryLabel b' /= b'' -> addSubst b' - _ -> (False, (addToUFM bmap hash (b : bs), subst)) - Nothing -> (False, (addToUFM bmap hash [b], subst)) + | otherwise -> (old_change, bmap, subst) + _ -> (old_change, addToUFM bmap hash (b : bs), subst) + Nothing -> (old_change, addToUFM bmap hash [b], subst) where bid = entryLabel b - addSubst b' = my_trace "found new common block" (ppr (entryLabel b')) $ - (True, (bmap, mapInsert bid (entryLabel b') subst)) - --- Given the map ``subst'' from BlockId -> BlockId, we rewrite the graph. 
-upd_graph :: CmmGraph -> BidMap -> CmmGraph -upd_graph g subst = mapGraphNodes (id, middle, last) g - where middle = mapExpDeep exp - last l = last' (mapExpDeep exp l) - last' :: CmmNode O C -> CmmNode O C - last' (CmmBranch bid) = CmmBranch $ sub bid - last' (CmmCondBranch p t f) = cond p (sub t) (sub f) - last' (CmmCall t (Just bid) a r o) = CmmCall t (Just $ sub bid) a r o - last' l@(CmmCall _ Nothing _ _ _) = l - last' (CmmForeignCall t r a bid u i) = CmmForeignCall t r a (sub bid) u i - last' (CmmSwitch e bs) = CmmSwitch e $ map (liftM sub) bs - cond p t f = if t == f then CmmBranch t else CmmCondBranch p t f - exp (CmmStackSlot (CallArea (Young id)) off) = - CmmStackSlot (CallArea (Young (sub id))) off - exp (CmmLit (CmmBlock id)) = CmmLit (CmmBlock (sub id)) - exp e = e - sub = lookupBid subst + addSubst b' = my_trace "found new common block" (ppr bid <> char '=' <> ppr (entryLabel b')) $ + (True, bmap, mapInsert bid (entryLabel b') subst) + + +-- ----------------------------------------------------------------------------- +-- Hashing and equality on blocks + +-- Below here is mostly boilerplate: hashing blocks ignoring labels, +-- and comparing blocks modulo a label mapping. -- To speed up comparisons, we hash each basic block modulo labels. -- The hashing is a bit arbitrary (the numbers are completely arbitrary), -- but it should be fast and good enough. -hash_block :: CmmBlock -> Int +hash_block :: CmmBlock -> HashCode hash_block block = fromIntegral (foldBlockNodesB3 (hash_fst, hash_mid, hash_lst) block (0 :: Word32) .&. 
(0x7fffffff :: Word32)) -- UniqFM doesn't like negative Ints @@ -103,11 +94,11 @@ hash_block block = hash_lst m h = hash_node m + h `shiftL` 1 hash_node :: CmmNode O x -> Word32 - hash_node (CmmComment (FastString u _ _ _ _)) = cvt u + hash_node (CmmComment _) = 0 -- don't care hash_node (CmmAssign r e) = hash_reg r + hash_e e hash_node (CmmStore e e') = hash_e e + hash_e e' hash_node (CmmUnsafeForeignCall t _ as) = hash_tgt t + hash_list hash_e as - hash_node (CmmBranch _) = 23 -- would be great to hash these properly + hash_node (CmmBranch _) = 23 -- NB. ignore the label hash_node (CmmCondBranch p _ _) = hash_e p hash_node (CmmCall e _ _ _ _) = hash_e e hash_node (CmmForeignCall t _ _ _ _ _) = hash_tgt t @@ -143,18 +134,60 @@ hash_block block = -- Utilities: equality and substitution on the graph. -- Given a map ``subst'' from BlockID -> BlockID, we define equality. -eqBid :: BidMap -> BlockId -> BlockId -> Bool +eqBid :: BlockEnv BlockId -> BlockId -> BlockId -> Bool eqBid subst bid bid' = lookupBid subst bid == lookupBid subst bid' -lookupBid :: BidMap -> BlockId -> BlockId +lookupBid :: BlockEnv BlockId -> BlockId -> BlockId lookupBid subst bid = case mapLookup bid subst of Just bid -> lookupBid subst bid Nothing -> bid --- Equality on the body of a block, modulo a function mapping block IDs to block IDs. +-- Middle nodes and expressions can contain BlockIds, in particular in +-- CmmStackSlot and CmmBlock, so we have to use a special equality for +-- these. 
+-- +eqMiddleWith :: (BlockId -> BlockId -> Bool) + -> CmmNode O O -> CmmNode O O -> Bool +eqMiddleWith _ (CmmComment _) (CmmComment _) = True +eqMiddleWith eqBid (CmmAssign r1 e1) (CmmAssign r2 e2) + = r1 == r2 && eqExprWith eqBid e1 e2 +eqMiddleWith eqBid (CmmStore l1 r1) (CmmStore l2 r2) + = eqExprWith eqBid l1 l2 && eqExprWith eqBid r1 r2 +eqMiddleWith eqBid (CmmUnsafeForeignCall t1 r1 a1) + (CmmUnsafeForeignCall t2 r2 a2) + = t1 == t2 && r1 == r2 && and (zipWith (eqExprWith eqBid) a1 a2) +eqMiddleWith _ _ _ = False + +eqExprWith :: (BlockId -> BlockId -> Bool) + -> CmmExpr -> CmmExpr -> Bool +eqExprWith eqBid = eq + where + CmmLit l1 `eq` CmmLit l2 = eqLit l1 l2 + CmmLoad e1 _ `eq` CmmLoad e2 _ = e1 `eq` e2 + CmmReg r1 `eq` CmmReg r2 = r1==r2 + CmmRegOff r1 i1 `eq` CmmRegOff r2 i2 = r1==r2 && i1==i2 + CmmMachOp op1 es1 `eq` CmmMachOp op2 es2 = op1==op2 && es1 `eqs` es2 + CmmStackSlot a1 i1 `eq` CmmStackSlot a2 i2 = eqArea a1 a2 && i1==i2 + _e1 `eq` _e2 = False + + xs `eqs` ys = and (zipWith eq xs ys) + + eqLit (CmmBlock id1) (CmmBlock id2) = eqBid id1 id2 + eqLit l1 l2 = l1 == l2 + + eqArea Old Old = True + eqArea (Young id1) (Young id2) = eqBid id1 id2 + eqArea _ _ = False + +-- Equality on the body of a block, modulo a function mapping block +-- IDs to block IDs. 
eqBlockBodyWith :: (BlockId -> BlockId -> Bool) -> CmmBlock -> CmmBlock -> Bool -eqBlockBodyWith eqBid block block' = middles == middles' && eqLastWith eqBid last last' - where (_, middles , JustC last :: MaybeC C (CmmNode O C)) = blockToNodeList block - (_, middles', JustC last' :: MaybeC C (CmmNode O C)) = blockToNodeList block' +eqBlockBodyWith eqBid block block' + = and (zipWith (eqMiddleWith eqBid) (blockToList m) (blockToList m')) && + eqLastWith eqBid l l' + where (_,m,l) = blockSplit block + (_,m',l') = blockSplit block' + + eqLastWith :: (BlockId -> BlockId -> Bool) -> CmmNode O C -> CmmNode O C -> Bool eqLastWith eqBid (CmmBranch bid1) (CmmBranch bid2) = eqBid bid1 bid2 diff --git a/compiler/cmm/CmmContFlowOpt.hs b/compiler/cmm/CmmContFlowOpt.hs index 73ce57e93f..3fabf33f97 100644 --- a/compiler/cmm/CmmContFlowOpt.hs +++ b/compiler/cmm/CmmContFlowOpt.hs @@ -2,19 +2,19 @@ {-# OPTIONS_GHC -fno-warn-warnings-deprecations -fno-warn-incomplete-patterns #-} module CmmContFlowOpt - ( runCmmContFlowOpts - , removeUnreachableBlocks, replaceBranches + ( cmmCfgOpts + , cmmCfgOptsProc + , removeUnreachableBlocks + , replaceLabels ) where import BlockId import Cmm import CmmUtils -import Digraph import Maybes -import Outputable -import Compiler.Hoopl +import Hoopl import Control.Monad import Prelude hiding (succ, unzip, zip) @@ -24,104 +24,158 @@ import Prelude hiding (succ, unzip, zip) -- ----------------------------------------------------------------------------- -runCmmContFlowOpts :: CmmGroup -> CmmGroup -runCmmContFlowOpts = map (optProc cmmCfgOpts) - cmmCfgOpts :: CmmGraph -> CmmGraph -cmmCfgOpts = removeUnreachableBlocks . blockConcat . branchChainElim - -- Here branchChainElim can ultimately be replaced - -- with a more exciting combination of optimisations +cmmCfgOpts = removeUnreachableBlocks . 
blockConcat + +cmmCfgOptsProc :: CmmDecl -> CmmDecl +cmmCfgOptsProc = optProc cmmCfgOpts optProc :: (g -> g) -> GenCmmDecl d h g -> GenCmmDecl d h g optProc opt (CmmProc info lbl g) = CmmProc info lbl (opt g) optProc _ top = top + ----------------------------------------------------------------------------- -- --- Branch Chain Elimination +-- Block concatenation -- ----------------------------------------------------------------------------- --- | Remove any basic block of the form L: goto L', and replace L with --- L' everywhere else, unless L is the successor of a call instruction --- and L' is the entry block. You don't want to set the successor of a --- function call to the entry block because there is no good way to --- store both the infotables for the call and from the callee, while --- putting the stack pointer in a consistent place. +-- This optimisation does two things: +-- - If a block finishes with an unconditional branch, then we may +-- be able to concatenate the block it points to and remove the +-- branch. We do this either if the destination block is small +-- (e.g. just another branch), or if this is the only jump to +-- this particular destination block. +-- +-- - If a block finishes in a call whose continuation block is a +-- goto, then we can shortcut the destination, making the +-- continuation block the destination of the goto. +-- +-- Both transformations are improved by working from the end of the +-- graph towards the beginning, because we may be able to perform many +-- shortcuts in one go. + + +-- We need to walk over the blocks from the end back to the +-- beginning. We are going to maintain the "current" graph +-- (BlockEnv CmmBlock) as we go, and also a mapping from BlockId +-- to BlockId, representing continuation labels that we have +-- renamed. This latter mapping is important because we might +-- shortcut a CmmCall continuation. 
For example: +-- +-- Sp[0] = L +-- call g returns to L +-- +-- L: goto M -- --- JD isn't quite sure when it's safe to share continuations for different --- function calls -- have to think about where the SP will be, --- so we'll table that problem for now by leaving all call successors alone. - -branchChainElim :: CmmGraph -> CmmGraph -branchChainElim g - | null lone_branch_blocks = g -- No blocks to remove - | otherwise = {- pprTrace "branchChainElim" (ppr forest) $ -} - replaceLabels (mapFromList edges) g +-- M: ... +-- +-- So when we shortcut the L block, we need to replace not only +-- the continuation of the call, but also references to L in the +-- code (e.g. the assignment Sp[0] = L). So we keep track of +-- which labels we have renamed and apply the mapping at the end +-- with replaceLabels. + +blockConcat :: CmmGraph -> CmmGraph +blockConcat g@CmmGraph { g_entry = entry_id } + = replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks where - blocks = toBlockList g - - lone_branch_blocks :: [(BlockId, BlockId)] - -- each (L,K) is a block of the form - -- L : goto K - lone_branch_blocks = mapCatMaybes isLoneBranch blocks - - call_succs = foldl add emptyBlockSet blocks - where add :: BlockSet -> CmmBlock -> BlockSet - add succs b = - case lastNode b of - (CmmCall _ (Just k) _ _ _) -> setInsert k succs - (CmmForeignCall {succ=k}) -> setInsert k succs - _ -> succs - - isLoneBranch :: CmmBlock -> Maybe (BlockId, BlockId) - isLoneBranch block - | (JustC (CmmEntry id), [], JustC (CmmBranch target)) <- blockToNodeList block - , not (setMember id call_succs) - = Just (id,target) - | otherwise - = Nothing - - -- We build a graph from lone_branch_blocks (every node has only - -- one out edge). Then we - -- - topologically sort the graph: if from A we can reach B, - -- then A occurs before B in the result list. - -- - depth-first search starting from the nodes in this list. - -- This gives us a [[node]], in which each list is a dependency - -- chain. 
- -- - for each list [a1,a2,...an] replace branches to ai with an. - -- - -- This approach nicely deals with cycles by ignoring them. - -- Branches in a cycle will be redirected to somewhere in the - -- cycle, but we don't really care where. A cycle should be dead code, - -- and so will be eliminated by removeUnreachableBlocks. - -- - fromNode (b,_) = b - toNode a = (a,a) - - all_block_ids :: LabelSet - all_block_ids = setFromList (map fst lone_branch_blocks) - `setUnion` - setFromList (map snd lone_branch_blocks) - - forest = dfsTopSortG $ graphFromVerticesAndAdjacency nodes lone_branch_blocks - where nodes = map toNode $ setElems $ all_block_ids - - edges = [ (fromNode y, fromNode x) - | (x:xs) <- map reverse forest, y <- xs ] + -- we might be able to shortcut the entry BlockId itself + new_entry + | Just entry_blk <- mapLookup entry_id new_blocks + , Just dest <- canShortcut entry_blk + = dest + | otherwise + = entry_id ----------------------------------------------------------------- + blocks = postorderDfs g + + (new_blocks, shortcut_map) = + foldr maybe_concat (toBlockMap g, mapEmpty) blocks + + maybe_concat :: CmmBlock + -> (BlockEnv CmmBlock, BlockEnv BlockId) + -> (BlockEnv CmmBlock, BlockEnv BlockId) + maybe_concat block (blocks, shortcut_map) + | CmmBranch b' <- last + , Just blk' <- mapLookup b' blocks + , shouldConcatWith b' blk' + = (mapInsert bid (splice head blk') blocks, shortcut_map) + + -- calls: if we can shortcut the continuation label, then + -- we must *also* remember to substitute for the label in the + -- code, because we will push it somewhere. + | Just b' <- callContinuation_maybe last + , Just blk' <- mapLookup b' blocks + , Just dest <- canShortcut blk' + = (blocks, mapInsert b' dest shortcut_map) + -- replaceLabels will substitute dest for b' everywhere, later + + -- non-calls: see if we can shortcut any of the successors. 
+ | Nothing <- callContinuation_maybe last + = ( mapInsert bid (blockJoinTail head shortcut_last) blocks + , shortcut_map ) + + | otherwise + = (blocks, shortcut_map) + where + (head, last) = blockSplitTail block + bid = entryLabel block + shortcut_last = mapSuccessors shortcut last + shortcut l = + case mapLookup l blocks of + Just b | Just dest <- canShortcut b -> dest + _otherwise -> l + + shouldConcatWith b block + | num_preds b == 1 = True -- only one predecessor: go for it + | okToDuplicate block = True -- short enough to duplicate + | otherwise = False + where num_preds bid = mapLookup bid backEdges `orElse` 0 + + canShortcut :: CmmBlock -> Maybe BlockId + canShortcut block + | (_, middle, CmmBranch dest) <- blockSplit block + , isEmptyBlock middle + = Just dest + | otherwise + = Nothing + + backEdges :: BlockEnv Int -- number of predecessors for each block + backEdges = mapInsertWith (+) entry_id 1 $ -- add 1 for the entry id + mapMap setSize $ predMap blocks + + splice :: Block CmmNode C O -> CmmBlock -> CmmBlock + splice head rest = head `blockAppend` snd (blockSplitHead rest) + + +callContinuation_maybe :: CmmNode O C -> Maybe BlockId +callContinuation_maybe (CmmCall { cml_cont = Just b }) = Just b +callContinuation_maybe (CmmForeignCall { succ = b }) = Just b +callContinuation_maybe _ = Nothing + +okToDuplicate :: CmmBlock -> Bool +okToDuplicate block + = case blockSplit block of (_, m, _) -> isEmptyBlock m + -- cheap and cheerful; we might expand this in the future to + -- e.g. spot blocks that represent a single instruction or two + +------------------------------------------------------------------------ +-- Map over the CmmGraph, replacing each label with its mapping in the +-- supplied BlockEnv. replaceLabels :: BlockEnv BlockId -> CmmGraph -> CmmGraph -replaceLabels env = - replace_eid . 
mapGraphNodes1 txnode +replaceLabels env g + | mapNull env = g + | otherwise = replace_eid $ mapGraphNodes1 txnode g where replace_eid g = g {g_entry = lookup (g_entry g)} lookup id = mapLookup id env `orElse` id txnode :: CmmNode e x -> CmmNode e x txnode (CmmBranch bid) = CmmBranch (lookup bid) - txnode (CmmCondBranch p t f) = CmmCondBranch (exp p) (lookup t) (lookup f) + txnode (CmmCondBranch p t f) = mkCmmCondBranch (exp p) (lookup t) (lookup f) txnode (CmmSwitch e arms) = CmmSwitch (exp e) (map (liftM lookup) arms) txnode (CmmCall t k a res r) = CmmCall (exp t) (liftM lookup k) a res r txnode fc@CmmForeignCall{} = fc{ args = map exp (args fc) @@ -130,90 +184,25 @@ replaceLabels env = exp :: CmmExpr -> CmmExpr exp (CmmLit (CmmBlock bid)) = CmmLit (CmmBlock (lookup bid)) - exp (CmmStackSlot (CallArea (Young id)) i) = CmmStackSlot (CallArea (Young (lookup id))) i + exp (CmmStackSlot (Young id) i) = CmmStackSlot (Young (lookup id)) i exp e = e - -replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph -replaceBranches env g = mapGraphNodes (id, id, last) g - where - last :: CmmNode O C -> CmmNode O C - last (CmmBranch id) = CmmBranch (lookup id) - last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi) - last (CmmSwitch e tbl) = CmmSwitch e (map (fmap lookup) tbl) - last l@(CmmCall {}) = l - last l@(CmmForeignCall {}) = l - lookup id = fmap lookup (mapLookup id env) `orElse` id - -- XXX: this is a recursive lookup, it follows chains until the lookup - -- returns Nothing, at which point we return the last BlockId +mkCmmCondBranch :: CmmExpr -> Label -> Label -> CmmNode O C +mkCmmCondBranch p t f = if t == f then CmmBranch t else CmmCondBranch p t f ---------------------------------------------------------------- -- Build a map from a block to its set of predecessors. Very useful. 
+ predMap :: [CmmBlock] -> BlockEnv BlockSet predMap blocks = foldr add_preds mapEmpty blocks -- find the back edges where add_preds block env = foldl (add (entryLabel block)) env (successors block) add bid env b' = mapInsert b' (setInsert bid (mapLookup b' env `orElse` setEmpty)) env ------------------------------------------------------------------------------ --- --- Block concatenation --- ------------------------------------------------------------------------------ - --- If a block B branches to a label L, L is not the entry block, --- and L has no other predecessors, --- then we can splice the block starting with L onto the end of B. --- Order matters, so we work bottom up (reverse postorder DFS). --- This optimization can be inhibited by unreachable blocks, but --- the reverse postorder DFS returns only reachable blocks. --- --- To ensure correctness, we have to make sure that the BlockId of the block --- we are about to eliminate is not named in another instruction. --- --- Note: This optimization does _not_ subsume branch chain elimination. - -blockConcat :: CmmGraph -> CmmGraph -blockConcat g@(CmmGraph {g_entry=eid}) = - replaceLabels concatMap $ ofBlockMap (g_entry g) blocks' - where - blocks = postorderDfs g - - (blocks', concatMap) = - foldr maybe_concat (toBlockMap g, mapEmpty) $ blocks - - maybe_concat :: CmmBlock -> (LabelMap CmmBlock, LabelMap Label) -> (LabelMap CmmBlock, LabelMap Label) - maybe_concat b unchanged@(blocks', concatMap) = - let bid = entryLabel b - in case blockToNodeList b of - (JustC h, m, JustC (CmmBranch b')) -> - if canConcatWith b' then - (mapInsert bid (splice blocks' h m b') blocks', - mapInsert b' bid concatMap) - else unchanged - _ -> unchanged - - num_preds bid = liftM setSize (mapLookup bid backEdges) `orElse` 0 - - canConcatWith b' = b' /= eid && num_preds b' == 1 - - backEdges = predMap blocks - - splice :: forall map n e x. 
- IsMap map => - map (Block n e x) -> n C O -> [n O O] -> KeyOf map -> Block n C x - splice blocks' h m bid' = - case mapLookup bid' blocks' of - Nothing -> panic "unknown successor block" - Just block | (_, m', l') <- blockToNodeList block - -> blockOfNodeList (JustC h, (m ++ m'), l') - ----------------------------------------------------------------------------- -- -- Removing unreachable blocks --- ------------------------------------------------------------------------------ removeUnreachableBlocks :: CmmGraph -> CmmGraph removeUnreachableBlocks g diff --git a/compiler/cmm/CmmCvt.hs b/compiler/cmm/CmmCvt.hs index 80c6079aac..e72eee041c 100644 --- a/compiler/cmm/CmmCvt.hs +++ b/compiler/cmm/CmmCvt.hs @@ -12,29 +12,25 @@ import CmmUtils import qualified OldCmm as Old import OldPprCmm () -import Compiler.Hoopl hiding ((<*>), mkLabel, mkBranch) +import Hoopl hiding ((<*>), mkLabel, mkBranch) import Data.Maybe import Maybes import Outputable cmmOfZgraph :: CmmGroup -> Old.CmmGroup cmmOfZgraph tops = map mapTop tops - where mapTop (CmmProc h l g) = CmmProc (Old.CmmInfo Nothing Nothing (info_tbl h)) l (ofZgraph g) + where mapTop (CmmProc h l g) = CmmProc (info_tbl h) l (ofZgraph g) mapTop (CmmData s ds) = CmmData s ds data ValueDirection = Arguments | Results -add_hints :: Convention -> ValueDirection -> [a] -> [Old.CmmHinted a] +add_hints :: ForeignTarget -> ValueDirection -> [a] -> [Old.CmmHinted a] add_hints conv vd args = zipWith Old.CmmHinted args (get_hints conv vd) -get_hints :: Convention -> ValueDirection -> [ForeignHint] -get_hints (Foreign (ForeignConvention _ hints _)) Arguments = hints -get_hints (Foreign (ForeignConvention _ _ hints)) Results = hints -get_hints _other_conv _vd = repeat NoHint - -get_conv :: ForeignTarget -> Convention -get_conv (PrimTarget _) = NativeNodeCall -- JD: SUSPICIOUS -get_conv (ForeignTarget _ fc) = Foreign fc +get_hints :: ForeignTarget -> ValueDirection -> [ForeignHint] +get_hints (ForeignTarget _ (ForeignConvention _ hints 
_)) Arguments = hints +get_hints (ForeignTarget _ (ForeignConvention _ _ hints)) Results = hints +get_hints (PrimTarget _) _vd = repeat NoHint cmm_target :: ForeignTarget -> Old.CmmCallTarget cmm_target (PrimTarget op) = Old.CmmPrim op Nothing @@ -89,8 +85,8 @@ ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g CmmUnsafeForeignCall (PrimTarget MO_Touch) _ _ -> Old.CmmNop CmmUnsafeForeignCall target ress args -> Old.CmmCall (cmm_target target) - (add_hints (get_conv target) Results ress) - (add_hints (get_conv target) Arguments args) + (add_hints target Results ress) + (add_hints target Arguments args) Old.CmmMayReturn last :: CmmNode O C -> () -> [Old.CmmStmt] diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs index 6eb91e89ba..646ecb5c67 100644 --- a/compiler/cmm/CmmExpr.hs +++ b/compiler/cmm/CmmExpr.hs @@ -14,11 +14,11 @@ module CmmExpr , GlobalReg(..), globalRegType, spReg, hpReg, spLimReg, nodeReg, node, baseReg , VGcPtr(..), vgcFlag -- Temporary! , DefinerOfLocalRegs, UserOfLocalRegs, foldRegsDefd, foldRegsUsed, filterRegsUsed - , DefinerOfSlots, UserOfSlots, foldSlotsDefd, foldSlotsUsed , RegSet, emptyRegSet, elemRegSet, extendRegSet, deleteFromRegSet, mkRegSet - , plusRegSet, minusRegSet, timesRegSet - , regUsedIn, regSlot - , Area(..), AreaId(..), SubArea, SubAreaSet, AreaMap, isStackSlotOf + , plusRegSet, minusRegSet, timesRegSet, sizeRegSet, nullRegSet + , regSetToList + , regUsedIn + , Area(..) 
, module CmmMachOp , module CmmType ) @@ -31,9 +31,9 @@ import CmmMachOp import BlockId import CLabel import Unique -import UniqSet -import Data.Map (Map) +import Data.Set (Set) +import qualified Data.Set as Set ----------------------------------------------------------------------------- -- CmmExpr @@ -42,11 +42,12 @@ import Data.Map (Map) data CmmExpr = CmmLit CmmLit -- Literal - | CmmLoad CmmExpr CmmType -- Read memory location - | CmmReg CmmReg -- Contents of register + | CmmLoad !CmmExpr !CmmType -- Read memory location + | CmmReg !CmmReg -- Contents of register | CmmMachOp MachOp [CmmExpr] -- Machine operation (+, -, *, etc.) - | CmmStackSlot Area Int -- addressing expression of a stack slot - | CmmRegOff CmmReg Int + | CmmStackSlot Area {-# UNPACK #-} !Int + -- addressing expression of a stack slot + | CmmRegOff !CmmReg Int -- CmmRegOff reg i -- ** is shorthand only, meaning ** -- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)] @@ -62,20 +63,16 @@ instance Eq CmmExpr where -- Equality ignores the types _e1 == _e2 = False data CmmReg - = CmmLocal LocalReg + = CmmLocal {-# UNPACK #-} !LocalReg | CmmGlobal GlobalReg deriving( Eq, Ord ) -- | A stack area is either the stack slot where a variable is spilled -- or the stack space where function arguments and results are passed. data Area - = RegSlot LocalReg - | CallArea AreaId - deriving (Eq, Ord) - -data AreaId = Old -- See Note [Old Area] - | Young BlockId + | Young {-# UNPACK #-} !BlockId -- Invariant: must be a continuation BlockId + -- See Note [Continuation BlockId] in CmmNode. deriving (Eq, Ord) {- Note [Old Area] @@ -94,15 +91,8 @@ necessarily at the young end of the Old area. 
End of note -} -type SubArea = (Area, Int, Int) -- area, offset, width -type SubAreaSet = Map Area [SubArea] - -type AreaMap = Map Area Int - -- Byte offset of the oldest byte of the Area, - -- relative to the oldest byte of the Old Area - data CmmLit - = CmmInt Integer Width + = CmmInt !Integer Width -- Interpretation: the 2's complement representation of the value -- is truncated to the specified size. This is easier than trying -- to keep the value within range, because we don't know whether @@ -120,7 +110,11 @@ data CmmLit -- It is also used inside the NCG during when generating -- position-independent code. | CmmLabelDiffOff CLabel CLabel Int -- label1 - label2 + offset - | CmmBlock BlockId -- Code label + + | CmmBlock {-# UNPACK #-} !BlockId -- Code label + -- Invariant: must be a continuation BlockId + -- See Note [Continuation BlockId] in CmmNode. + | CmmHighStackMark -- stands for the max stack space used during a procedure deriving Eq @@ -163,7 +157,7 @@ maybeInvertCmmExpr _ = Nothing ----------------------------------------------------------------------------- data LocalReg - = LocalReg !Unique CmmType + = LocalReg {-# UNPACK #-} !Unique CmmType -- ^ Parameters: -- 1. Identifier -- 2. Type @@ -189,22 +183,35 @@ localRegType (LocalReg _ rep) = rep ----------------------------------------------------------------------------- -- | Sets of local registers -type RegSet = UniqSet LocalReg + +-- These are used for dataflow facts, and a common operation is taking +-- the union of two RegSets and then asking whether the union is the +-- same as one of the inputs. UniqSet isn't good here, because +-- sizeUniqSet is O(n) whereas Set.size is O(1), so we use ordinary +-- Sets. 
+ +type RegSet = Set LocalReg emptyRegSet :: RegSet +nullRegSet :: RegSet -> Bool elemRegSet :: LocalReg -> RegSet -> Bool extendRegSet :: RegSet -> LocalReg -> RegSet deleteFromRegSet :: RegSet -> LocalReg -> RegSet mkRegSet :: [LocalReg] -> RegSet minusRegSet, plusRegSet, timesRegSet :: RegSet -> RegSet -> RegSet - -emptyRegSet = emptyUniqSet -elemRegSet = elementOfUniqSet -extendRegSet = addOneToUniqSet -deleteFromRegSet = delOneFromUniqSet -mkRegSet = mkUniqSet -minusRegSet = minusUniqSet -plusRegSet = unionUniqSets -timesRegSet = intersectUniqSets +sizeRegSet :: RegSet -> Int +regSetToList :: RegSet -> [LocalReg] + +emptyRegSet = Set.empty +nullRegSet = Set.null +elemRegSet = Set.member +extendRegSet = flip Set.insert +deleteFromRegSet = flip Set.delete +mkRegSet = Set.fromList +minusRegSet = Set.difference +plusRegSet = Set.union +timesRegSet = Set.intersection +sizeRegSet = Set.size +regSetToList = Set.toList class UserOfLocalRegs a where foldRegsUsed :: (b -> LocalReg -> b) -> b -> a -> b @@ -236,7 +243,7 @@ instance DefinerOfLocalRegs LocalReg where foldRegsDefd f z r = f z r instance UserOfLocalRegs RegSet where - foldRegsUsed f = foldUniqSet (flip f) + foldRegsUsed f = Set.fold (flip f) instance UserOfLocalRegs CmmExpr where foldRegsUsed f z e = expr z e @@ -271,49 +278,6 @@ reg `regUsedIn` CmmMachOp _ es = any (reg `regUsedIn`) es _ `regUsedIn` CmmStackSlot _ _ = False ----------------------------------------------------------------------------- --- Stack slots ------------------------------------------------------------------------------ - -isStackSlotOf :: CmmExpr -> LocalReg -> Bool -isStackSlotOf (CmmStackSlot (RegSlot r) _) r' = r == r' -isStackSlotOf _ _ = False - -regSlot :: LocalReg -> CmmExpr -regSlot r = CmmStackSlot (RegSlot r) (widthInBytes $ typeWidth $ localRegType r) - ------------------------------------------------------------------------------ --- Stack slot use information for expressions and other types [_$_] 
------------------------------------------------------------------------------ - --- Fold over the area, the offset into the area, and the width of the subarea. -class UserOfSlots a where - foldSlotsUsed :: (b -> SubArea -> b) -> b -> a -> b - -class DefinerOfSlots a where - foldSlotsDefd :: (b -> SubArea -> b) -> b -> a -> b - -instance UserOfSlots CmmExpr where - foldSlotsUsed f z e = expr z e - where expr z (CmmLit _) = z - expr z (CmmLoad (CmmStackSlot a i) ty) = f z (a, i, widthInBytes $ typeWidth ty) - expr z (CmmLoad addr _) = foldSlotsUsed f z addr - expr z (CmmReg _) = z - expr z (CmmMachOp _ exprs) = foldSlotsUsed f z exprs - expr z (CmmRegOff _ _) = z - expr z (CmmStackSlot _ _) = z - -instance UserOfSlots a => UserOfSlots [a] where - foldSlotsUsed _ set [] = set - foldSlotsUsed f set (x:xs) = foldSlotsUsed f (foldSlotsUsed f set x) xs - -instance DefinerOfSlots a => DefinerOfSlots [a] where - foldSlotsDefd _ set [] = set - foldSlotsDefd f set (x:xs) = foldSlotsDefd f (foldSlotsDefd f set x) xs - -instance DefinerOfSlots SubArea where - foldSlotsDefd f z a = f z a - ------------------------------------------------------------------------------ -- Global STG registers ----------------------------------------------------------------------------- diff --git a/compiler/cmm/CmmInfo.hs b/compiler/cmm/CmmInfo.hs index fe0c104d1c..a171faa057 100644 --- a/compiler/cmm/CmmInfo.hs +++ b/compiler/cmm/CmmInfo.hs @@ -19,6 +19,8 @@ import CmmUtils import CLabel import SMRep import Bitmap +import Stream (Stream) +import qualified Stream import Maybes import Constants @@ -40,10 +42,16 @@ mkEmptyContInfoTable info_lbl , cit_prof = NoProfilingInfo , cit_srt = NoC_SRT } -cmmToRawCmm :: Platform -> [Old.CmmGroup] -> IO [Old.RawCmmGroup] +cmmToRawCmm :: Platform -> Stream IO Old.CmmGroup () + -> IO (Stream IO Old.RawCmmGroup ()) cmmToRawCmm platform cmms = do { uniqs <- mkSplitUniqSupply 'i' - ; return (initUs_ uniqs (mapM (concatMapM (mkInfoTable platform)) cmms)) } + ; let 
do_one uniqs cmm = do + case initUs uniqs $ concatMapM (mkInfoTable platform) cmm of + (b,uniqs') -> return (uniqs',b) + -- NB. strictness fixes a space leak. DO NOT REMOVE. + ; return (Stream.mapAccumL do_one uniqs cmms >> return ()) + } -- Make a concrete info table, represented as a list of CmmStatic -- (it can't be simply a list of Word, because the SRT field is @@ -82,7 +90,7 @@ mkInfoTable :: Platform -> CmmDecl -> UniqSM [RawCmmDecl] mkInfoTable _ (CmmData sec dat) = return [CmmData sec dat] -mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks) +mkInfoTable platform (CmmProc info entry_label blocks) | CmmNonInfoTable <- info -- Code without an info table. Easy. = return [CmmProc Nothing entry_label blocks] @@ -91,7 +99,8 @@ mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks) ; return (top_decls ++ mkInfoTableAndCode info_lbl info_cts entry_label blocks) } - | otherwise = panic "mkInfoTable" -- Patern match overlap check not clever enough + | otherwise = panic "mkInfoTable" + -- Patern match overlap check not clever enough ----------------------------------------------------- type InfoTableContents = ( [CmmLit] -- The standard part diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs new file mode 100644 index 0000000000..f0dce4a6a1 --- /dev/null +++ b/compiler/cmm/CmmLayoutStack.hs @@ -0,0 +1,1048 @@ +{-# LANGUAGE RecordWildCards, GADTs #-} +module CmmLayoutStack ( + cmmLayoutStack, setInfoTableStackMap, cmmSink + ) where + +import StgCmmUtils ( callerSaveVolatileRegs ) -- XXX +import StgCmmForeign ( saveThreadState, loadThreadState ) -- XXX + +import Cmm +import BlockId +import CLabel +import CmmUtils +import MkGraph +import Module +import ForeignCall +import CmmLive +import CmmProcPoint +import SMRep +import Hoopl hiding ((<*>), mkLast, mkMiddle) +import Constants +import UniqSupply +import Maybes +import UniqFM +import Util + +import FastString +import Outputable +import Data.Map (Map) +import 
qualified Data.Map as Map +import qualified Data.Set as Set +import Control.Monad.Fix +import Data.Array as Array +import Data.Bits +import Data.List (nub, partition) +import Control.Monad (liftM) + +#include "HsVersions.h" + + +data StackSlot = Occupied | Empty + -- Occupied: a return address or part of an update frame + +instance Outputable StackSlot where + ppr Occupied = ptext (sLit "XXX") + ppr Empty = ptext (sLit "---") + +-- All stack locations are expressed as positive byte offsets from the +-- "base", which is defined to be the address above the return address +-- on the stack on entry to this CmmProc. +-- +-- Lower addresses have higher StackLocs. +-- +type StackLoc = ByteOff + +{- + A StackMap describes the stack at any given point. At a continuation + it has a particular layout, like this: + + | | <- base + |-------------| + | ret0 | <- base + 8 + |-------------| + . upd frame . <- base + sm_ret_off + |-------------| + | | + . vars . + . (live/dead) . + | | <- base + sm_sp - sm_args + |-------------| + | ret1 | + . ret vals . <- base + sm_sp (<--- Sp points here) + |-------------| + +Why do we include the final return address (ret0) in our stack map? I +have absolutely no idea, but it seems to be done that way consistently +in the rest of the code generator, so I played along here. --SDM + +Note that we will be constructing an info table for the continuation +(ret1), which needs to describe the stack down to, but not including, +the update frame (or ret0, if there is no update frame). +-} + +data StackMap = StackMap + { sm_sp :: StackLoc + -- ^ the offset of Sp relative to the base on entry + -- to this block. + , sm_args :: ByteOff + -- ^ the number of bytes of arguments in the area for this block + -- Defn: the offset of young(L) relative to the base is given by + -- (sm_sp - sm_args) of the StackMap for block L. 
+ , sm_ret_off :: ByteOff + -- ^ Number of words of stack that we do not describe with an info + -- table, because it contains an update frame. + , sm_regs :: UniqFM (LocalReg,StackLoc) + -- ^ regs on the stack + } + +instance Outputable StackMap where + ppr StackMap{..} = + text "Sp = " <> int sm_sp $$ + text "sm_args = " <> int sm_args $$ + text "sm_ret_off = " <> int sm_ret_off $$ + text "sm_regs = " <> ppr (eltsUFM sm_regs) + + +cmmLayoutStack :: ProcPointSet -> ByteOff -> CmmGraph + -> UniqSM (CmmGraph, BlockEnv StackMap) +cmmLayoutStack procpoints entry_args + graph0@(CmmGraph { g_entry = entry }) + = do + pprTrace "cmmLayoutStack" (ppr entry_args) $ return () + (graph, liveness) <- removeDeadAssignments graph0 + pprTrace "liveness" (ppr liveness) $ return () + let blocks = postorderDfs graph + + (final_stackmaps, final_high_sp, new_blocks) <- + mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) -> + layout procpoints liveness entry entry_args + rec_stackmaps rec_high_sp blocks + + new_blocks' <- mapM lowerSafeForeignCall new_blocks + + pprTrace ("Sp HWM") (ppr final_high_sp) $ + return (ofBlockList entry new_blocks', final_stackmaps) + + + +layout :: BlockSet -- proc points + -> BlockEnv CmmLive -- liveness + -> BlockId -- entry + -> ByteOff -- stack args on entry + + -> BlockEnv StackMap -- [final] stack maps + -> ByteOff -- [final] Sp high water mark + + -> [CmmBlock] -- [in] blocks + + -> UniqSM + ( BlockEnv StackMap -- [out] stack maps + , ByteOff -- [out] Sp high water mark + , [CmmBlock] -- [out] new blocks + ) + +layout procpoints liveness entry entry_args final_stackmaps final_hwm blocks + = go blocks init_stackmap entry_args [] + where + (updfr, cont_info) = collectContInfo blocks + + init_stackmap = mapSingleton entry StackMap{ sm_sp = entry_args + , sm_args = entry_args + , sm_ret_off = updfr + , sm_regs = emptyUFM + } + + go [] acc_stackmaps acc_hwm acc_blocks + = return (acc_stackmaps, acc_hwm, acc_blocks) + + go (b0 : bs) acc_stackmaps acc_hwm 
acc_blocks + = do + let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0 + + let stack0@StackMap { sm_sp = sp0 } + = mapFindWithDefault + (pprPanic "no stack map for" (ppr entry_lbl)) + entry_lbl acc_stackmaps + + pprTrace "layout" (ppr entry_lbl <+> ppr stack0) $ return () + + -- (a) Update the stack map to include the effects of + -- assignments in this block + let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0 + + -- (b) Insert assignments to reload all the live variables if this + -- block is a proc point + let middle1 = if entry_lbl `setMember` procpoints + then foldr blockCons middle0 (insertReloads stack0) + else middle0 + + -- (c) Look at the last node and if we are making a call or + -- jumping to a proc point, we must save the live + -- variables, adjust Sp, and construct the StackMaps for + -- each of the successor blocks. See handleLastNode for + -- details. + (middle2, sp_off, last1, fixup_blocks, out) + <- handleLastNode procpoints liveness cont_info + acc_stackmaps stack1 middle0 last0 + + pprTrace "layout(out)" (ppr out) $ return () + + -- (d) Manifest Sp: run over the nodes in the block and replace + -- CmmStackSlot with CmmLoad from Sp with a concrete offset. + -- + -- our block: + -- middle1 -- the original middle nodes + -- middle2 -- live variable saves from handleLastNode + -- Sp = Sp + sp_off -- Sp adjustment goes here + -- last1 -- the last node + -- + let middle_pre = blockToList $ foldl blockSnoc middle1 middle2 + + sp_high = final_hwm - entry_args + -- The stack check value is adjusted by the Sp offset on + -- entry to the proc, which is entry_args. We are + -- assuming that we only do a stack check at the + -- beginning of a proc, and we don't modify Sp before the + -- check. 
+ + final_blocks = manifestSp final_stackmaps stack0 sp0 sp_high entry0 + middle_pre sp_off last1 fixup_blocks + + acc_stackmaps' = mapUnion acc_stackmaps out + + hwm' = maximum (acc_hwm : (sp0 - sp_off) : map sm_sp (mapElems out)) + + go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks) + + +-- ----------------------------------------------------------------------------- + +-- This doesn't seem right somehow. We need to find out whether this +-- proc will push some update frame material at some point, so that we +-- can avoid using that area of the stack for spilling. The +-- updfr_space field of the CmmProc *should* tell us, but it doesn't +-- (I think maybe it gets filled in later when we do proc-point +-- splitting). +-- +-- So we'll just take the max of all the cml_ret_offs. This could be +-- unnecessarily pessimistic, but probably not in the code we +-- generate. + +collectContInfo :: [CmmBlock] -> (ByteOff, BlockEnv ByteOff) +collectContInfo blocks + = (maximum ret_offs, mapFromList (catMaybes mb_argss)) + where + (mb_argss, ret_offs) = mapAndUnzip get_cont blocks + + get_cont b = + case lastNode b of + CmmCall { cml_cont = Just l, .. } + -> (Just (l, cml_ret_args), cml_ret_off) + CmmForeignCall { .. } + -> (Just (succ, 0), updfr) -- ?? + _other -> (Nothing, 0) + + +-- ----------------------------------------------------------------------------- +-- Updating the StackMap from middle nodes + +-- Look for loads from stack slots, and update the StackMap. This is +-- purely for optimisation reasons, so that we can avoid saving a +-- variable back to a different stack slot if it is already on the +-- stack. +-- +-- This happens a lot: for example when function arguments are passed +-- on the stack and need to be immediately saved across a call, we +-- want to just leave them where they are on the stack. 
+-- +procMiddle :: BlockEnv StackMap -> CmmNode e x -> StackMap -> StackMap +procMiddle stackmaps node sm + = case node of + CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _) + -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) } + where loc = getStackLoc area off stackmaps + CmmAssign (CmmLocal r) _other + -> sm { sm_regs = delFromUFM (sm_regs sm) r } + _other + -> sm + +getStackLoc :: Area -> ByteOff -> BlockEnv StackMap -> StackLoc +getStackLoc Old n _ = n +getStackLoc (Young l) n stackmaps = + case mapLookup l stackmaps of + Nothing -> pprPanic "getStackLoc" (ppr l) + Just sm -> sm_sp sm - sm_args sm + n + + +-- ----------------------------------------------------------------------------- +-- Handling stack allocation for a last node + +-- We take a single last node and turn it into: +-- +-- C1 (some statements) +-- Sp = Sp + N +-- C2 (some more statements) +-- call f() -- the actual last node +-- +-- plus possibly some more blocks (we may have to add some fixup code +-- between the last node and the continuation). +-- +-- C1: is the code for saving the variables across this last node onto +-- the stack, if the continuation is a call or jumps to a proc point. +-- +-- C2: if the last node is a safe foreign call, we have to inject some +-- extra code that goes *after* the Sp adjustment. + +handleLastNode + :: ProcPointSet -> BlockEnv CmmLive -> BlockEnv ByteOff + -> BlockEnv StackMap -> StackMap + -> Block CmmNode O O + -> CmmNode O C + -> UniqSM + ( [CmmNode O O] -- nodes to go *before* the Sp adjustment + , ByteOff -- amount to adjust Sp + , CmmNode O C -- new last node + , [CmmBlock] -- new blocks + , BlockEnv StackMap -- stackmaps for the continuations + ) + +handleLastNode procpoints liveness cont_info stackmaps + stack0@StackMap { sm_sp = sp0 } middle last + = case last of + -- At each return / tail call, + -- adjust Sp to point to the last argument pushed, which + -- is cml_args, after popping any other junk from the stack. 
+ CmmCall{ cml_cont = Nothing, .. } -> do + let sp_off = sp0 - cml_args + return ([], sp_off, last, [], mapEmpty) + + -- At each CmmCall with a continuation: + CmmCall{ cml_cont = Just cont_lbl, .. } -> + return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off + + CmmForeignCall{ succ = cont_lbl, .. } -> do + return $ lastCall cont_lbl wORD_SIZE wORD_SIZE (sm_ret_off stack0) + -- one word each for args and results: the return address + + CmmBranch{..} -> handleProcPoints + CmmCondBranch{..} -> handleProcPoints + CmmSwitch{..} -> handleProcPoints + + where + -- Calls and ForeignCalls are handled the same way: + lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff + -> ( [CmmNode O O] + , ByteOff + , CmmNode O C + , [CmmBlock] + , BlockEnv StackMap + ) + lastCall lbl cml_args cml_ret_args cml_ret_off + = ( assignments + , spOffsetForCall sp0 cont_stack cml_args + , last + , [] -- no new blocks + , mapSingleton lbl cont_stack ) + where + (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off + + + prepareStack lbl cml_ret_args cml_ret_off + | Just cont_stack <- mapLookup lbl stackmaps + -- If we have already seen this continuation before, then + -- we just have to make the stack look the same: + = (fixupStack stack0 cont_stack, cont_stack) + -- Otherwise, we have to allocate the stack frame + | otherwise + = (save_assignments, new_cont_stack) + where + (new_cont_stack, save_assignments) + = setupStackFrame lbl liveness cml_ret_off cml_ret_args stack0 + + + -- For other last nodes (branches), if any of the targets is a + -- proc point, we have to set up the stack to match what the proc + -- point is expecting. 
+ -- + handleProcPoints :: UniqSM ( [CmmNode O O] + , ByteOff + , CmmNode O C + , [CmmBlock] + , BlockEnv StackMap ) + + handleProcPoints + -- Note [diamond proc point] + | Just l <- futureContinuation middle + , (nub $ filter (`setMember` procpoints) $ successors last) == [l] + = do + let cont_args = mapFindWithDefault 0 l cont_info + (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0) + out = mapFromList [ (l', cont_stack) + | l' <- successors last ] + return ( assigs + , spOffsetForCall sp0 cont_stack wORD_SIZE + , last + , [] + , out) + + | otherwise = do + pps <- mapM handleProcPoint (successors last) + let lbl_map :: LabelMap Label + lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ] + fix_lbl l = mapLookup l lbl_map `orElse` l + return ( [] + , 0 + , mapSuccessors fix_lbl last + , concat [ blk | (_,_,_,blk) <- pps ] + , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] ) + + -- For each proc point that is a successor of this block + -- (a) if the proc point already has a stackmap, we need to + -- shuffle the current stack to make it look the same. + -- We have to insert a new block to make this happen. 
+ -- (b) otherwise, call "allocate live stack0" to make the + -- stack map for the proc point + handleProcPoint :: BlockId + -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock]) + handleProcPoint l + | not (l `setMember` procpoints) = return (l, l, stack0, []) + | otherwise = do + tmp_lbl <- liftM mkBlockId $ getUniqueM + let + (stack2, assigs) = + case mapLookup l stackmaps of + Just pp_sm -> (pp_sm, fixupStack stack0 pp_sm) + Nothing -> + pprTrace "first visit to proc point" + (ppr l <+> ppr stack1) $ + (stack1, assigs) + where + cont_args = mapFindWithDefault 0 l cont_info + (stack1, assigs) = + setupStackFrame l liveness (sm_ret_off stack0) + cont_args stack0 + + sp_off = sp0 - sm_sp stack2 + + block = blockJoin (CmmEntry tmp_lbl) + (maybeAddSpAdj sp_off (blockFromList assigs)) + (CmmBranch l) + -- + return (l, tmp_lbl, stack2, [block]) + + + +-- Sp is currently pointing to current_sp, +-- we want it to point to +-- (sm_sp cont_stack - sm_args cont_stack + args) +-- so the difference is +-- sp0 - (sm_sp cont_stack - sm_args cont_stack + args) +spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff +spOffsetForCall current_sp cont_stack args + = current_sp - (sm_sp cont_stack - sm_args cont_stack + args) + + +-- | create a sequence of assignments to establish the new StackMap, +-- given the old StackMap. 
+fixupStack :: StackMap -> StackMap -> [CmmNode O O] +fixupStack old_stack new_stack = concatMap move new_locs + where + old_map :: Map LocalReg ByteOff + old_map = Map.fromList (stackSlotRegs old_stack) + new_locs = stackSlotRegs new_stack + + move (r,n) + | Just m <- Map.lookup r old_map, n == m = [] + | otherwise = [CmmStore (CmmStackSlot Old n) + (CmmReg (CmmLocal r))] + + + +setupStackFrame + :: BlockId -- label of continuation + -> BlockEnv CmmLive -- liveness + -> ByteOff -- updfr + -> ByteOff -- bytes of return values on stack + -> StackMap -- current StackMap + -> (StackMap, [CmmNode O O]) + +setupStackFrame lbl liveness updfr_off ret_args stack0 + = (cont_stack, assignments) + where + -- get the set of LocalRegs live in the continuation + live = mapFindWithDefault Set.empty lbl liveness + + -- the stack from the base to updfr_off is off-limits. + -- our new stack frame contains: + -- * saved live variables + -- * the return address [young(C) + 8] + -- * the args for the call, + -- which are replaced by the return values at the return + -- point. 
+ + -- everything up to updfr_off is off-limits + -- stack1 contains updfr_off, plus everything we need to save + (stack1, assignments) = allocate updfr_off live stack0 + + -- And the Sp at the continuation is: + -- sm_sp stack1 + ret_args + cont_stack = stack1{ sm_sp = sm_sp stack1 + ret_args + , sm_args = ret_args + , sm_ret_off = updfr_off + } + + +-- ----------------------------------------------------------------------------- +-- Note [diamond proc point] +-- +-- This special case looks for the pattern we get from a typical +-- tagged case expression: +-- +-- Sp[young(L1)] = L1 +-- if (R1 & 7) != 0 goto L1 else goto L2 +-- L2: +-- call [R1] returns to L1 +-- L1: live: {y} +-- x = R1 +-- +-- If we let the generic case handle this, we get +-- +-- Sp[-16] = L1 +-- if (R1 & 7) != 0 goto L1a else goto L2 +-- L2: +-- Sp[-8] = y +-- Sp = Sp - 16 +-- call [R1] returns to L1 +-- L1a: +-- Sp[-8] = y +-- Sp = Sp - 16 +-- goto L1 +-- L1: +-- x = R1 +-- +-- The code for saving the live vars is duplicated in each branch, and +-- furthermore there is an extra jump in the fast path (assuming L1 is +-- a proc point, which it probably is if there is a heap check). +-- +-- So to fix this we want to set up the stack frame before the +-- conditional jump. How do we know when to do this, and when it is +-- safe? The basic idea is, when we see the assignment +-- +-- Sp[young(L)] = L +-- +-- we know that +-- * we are definitely heading for L +-- * there can be no more reads from another stack area, because young(L) +-- overlaps with it. +-- +-- We don't necessarily know that everything live at L is live now +-- (some might be assigned between here and the jump to L). So we +-- simplify and only do the optimisation when we see +-- +-- (1) a block containing an assignment of a return address L +-- (2) ending in a branch where one (and only) continuation goes to L, +-- and no other continuations go to proc points. 
+-- +-- then we allocate the stack frame for L at the end of the block, +-- before the branch. +-- +-- We could generalise (2), but that would make it a bit more +-- complicated to handle, and this currently catches the common case. + +futureContinuation :: Block CmmNode O O -> Maybe BlockId +futureContinuation middle = foldBlockNodesB f middle Nothing + where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId + f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _ + = Just l + f _ r = r + +-- ----------------------------------------------------------------------------- +-- Saving live registers + +-- | Given a set of live registers and a StackMap, save all the registers +-- on the stack and return the new StackMap and the assignments to do +-- the saving. +-- +allocate :: ByteOff -> RegSet -> StackMap -> (StackMap, [CmmNode O O]) +allocate ret_off live stackmap@StackMap{ sm_sp = sp0 + , sm_regs = regs0 } + = + pprTrace "allocate" (ppr live $$ ppr stackmap) $ + + -- we only have to save regs that are not already in a slot + let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live) + regs1 = filterUFM (\(r,_) -> elemRegSet r live) regs0 + in + + -- make a map of the stack + let stack = reverse $ Array.elems $ + accumArray (\_ x -> x) Empty (1, toWords (max sp0 ret_off)) $ + ret_words ++ live_words + where ret_words = + [ (x, Occupied) + | x <- [ 1 .. toWords ret_off] ] + live_words = + [ (toWords x, Occupied) + | (r,off) <- eltsUFM regs1, + let w = localRegBytes r, + x <- [ off, off-wORD_SIZE .. off - w + 1] ] + in + + -- Pass over the stack: find slots to save all the new live variables, + -- choosing the oldest slots first (hence a foldr). 
+ let + save slot ([], stack, n, assigs, regs) -- no more regs to save + = ([], slot:stack, n `plusW` 1, assigs, regs) + save slot (to_save, stack, n, assigs, regs) + = case slot of + Occupied -> (to_save, Occupied:stack, n `plusW` 1, assigs, regs) + Empty + | Just (stack', r, to_save') <- + select_save to_save (slot:stack) + -> let assig = CmmStore (CmmStackSlot Old n') + (CmmReg (CmmLocal r)) + n' = n `plusW` 1 + in + (to_save', stack', n', assig : assigs, (r,(r,n')):regs) + + | otherwise + -> (to_save, slot:stack, n `plusW` 1, assigs, regs) + + -- we should do better here: right now we'll fit the smallest first, + -- but it would make more sense to fit the biggest first. + select_save :: [LocalReg] -> [StackSlot] + -> Maybe ([StackSlot], LocalReg, [LocalReg]) + select_save regs stack = go regs [] + where go [] _no_fit = Nothing + go (r:rs) no_fit + | Just rest <- dropEmpty words stack + = Just (replicate words Occupied ++ rest, r, rs++no_fit) + | otherwise + = go rs (r:no_fit) + where words = localRegWords r + + -- fill in empty slots as much as possible + (still_to_save, save_stack, n, save_assigs, save_regs) + = foldr save (to_save, [], 0, [], []) stack + + -- push any remaining live vars on the stack + (push_sp, push_assigs, push_regs) + = foldr push (n, [], []) still_to_save + where + push r (n, assigs, regs) + = (n', assig : assigs, (r,(r,n')) : regs) + where + n' = n + localRegBytes r + assig = CmmStore (CmmStackSlot Old n') + (CmmReg (CmmLocal r)) + + trim_sp + | not (null push_regs) = push_sp + | otherwise + = n `plusW` (- length (takeWhile isEmpty save_stack)) + + final_regs = regs1 `addListToUFM` push_regs + `addListToUFM` save_regs + + in + -- XXX should be an assert + if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else + + if (trim_sp .&. 
(wORD_SIZE - 1)) /= 0 then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else + + ( stackmap { sm_regs = final_regs , sm_sp = trim_sp } + , push_assigs ++ save_assigs ) + + +-- ----------------------------------------------------------------------------- +-- Manifesting Sp + +-- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp. The +-- block looks like this: +-- +-- middle_pre -- the middle nodes +-- Sp = Sp + sp_off -- Sp adjustment goes here +-- last -- the last node +-- +-- And we have some extra blocks too (that don't contain Sp adjustments) +-- +-- The adjustment for middle_pre will be different from that for +-- middle_post, because the Sp adjustment intervenes. +-- +manifestSp + :: BlockEnv StackMap -- StackMaps for other blocks + -> StackMap -- StackMap for this block + -> ByteOff -- Sp on entry to the block + -> ByteOff -- SpHigh + -> CmmNode C O -- first node + -> [CmmNode O O] -- middle + -> ByteOff -- sp_off + -> CmmNode O C -- last node + -> [CmmBlock] -- new blocks + -> [CmmBlock] -- final blocks with Sp manifest + +manifestSp stackmaps stack0 sp0 sp_high + first middle_pre sp_off last fixup_blocks + = final_block : fixup_blocks' + where + area_off = getAreaOff stackmaps + + adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x + adj_pre_sp = mapExpDeep (areaToSp sp0 sp_high area_off) + adj_post_sp = mapExpDeep (areaToSp (sp0 - sp_off) sp_high area_off) + + final_middle = maybeAddSpAdj sp_off $ + blockFromList $ + map adj_pre_sp $ + elimStackStores stack0 stackmaps area_off $ + middle_pre + + final_last = optStackCheck (adj_post_sp last) + + final_block = blockJoin first final_middle final_last + + fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks + + +getAreaOff :: BlockEnv StackMap -> (Area -> StackLoc) +getAreaOff _ Old = 0 +getAreaOff stackmaps (Young l) = + case mapLookup l stackmaps of + Just sm -> sm_sp sm - sm_args sm + Nothing -> pprPanic "getAreaOff" (ppr l) + + +maybeAddSpAdj :: 
ByteOff -> Block CmmNode O O -> Block CmmNode O O +maybeAddSpAdj 0 block = block +maybeAddSpAdj sp_off block + = block `blockSnoc` CmmAssign spReg (cmmOffset (CmmReg spReg) sp_off) + + +{- +Sp(L) is the Sp offset on entry to block L relative to the base of the +OLD area. + +SpArgs(L) is the size of the young area for L, i.e. the number of +arguments. + + - in block L, each reference to [old + N] turns into + [Sp + Sp(L) - N] + + - in block L, each reference to [young(L') + N] turns into + [Sp + Sp(L) - Sp(L') + SpArgs(L') - N] + + - be careful with the last node of each block: Sp has already been adjusted + to be Sp + Sp(L) - Sp(L') +-} + +areaToSp :: ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr +areaToSp sp_old _sp_hwm area_off (CmmStackSlot area n) = + cmmOffset (CmmReg spReg) (sp_old - area_off area - n) +areaToSp _ sp_hwm _ (CmmLit CmmHighStackMark) = CmmLit (mkIntCLit sp_hwm) +areaToSp _ _ _ (CmmMachOp (MO_U_Lt _) -- Note [null stack check] + [CmmMachOp (MO_Sub _) + [ CmmReg (CmmGlobal Sp) + , CmmLit (CmmInt 0 _)], + CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth) +areaToSp _ _ _ other = other + +-- ----------------------------------------------------------------------------- +-- Note [null stack check] +-- +-- If the high-water Sp is zero, then we end up with +-- +-- if (Sp - 0 < SpLim) then .. else .. +-- +-- and possibly some dead code for the failure case. Optimising this +-- away depends on knowing that SpLim <= Sp, so it is really the job +-- of the stack layout algorithm, hence we do it now. This is also +-- convenient because control-flow optimisation later will drop the +-- dead code. 
+ +optStackCheck :: CmmNode O C -> CmmNode O C +optStackCheck n = -- Note [null stack check] + case n of + CmmCondBranch (CmmLit (CmmInt 0 _)) _true false -> CmmBranch false + other -> other + + +-- ----------------------------------------------------------------------------- + +-- | Eliminate stores of the form +-- +-- Sp[area+n] = r +-- +-- when we know that r is already in the same slot as Sp[area+n]. We +-- could do this in a later optimisation pass, but that would involve +-- a separate analysis and we already have the information to hand +-- here. It helps clean up some extra stack stores in common cases. +-- +-- Note that we may have to modify the StackMap as we walk through the +-- code using procMiddle, since an assignment to a variable in the +-- StackMap will invalidate its mapping there. +-- +elimStackStores :: StackMap + -> BlockEnv StackMap + -> (Area -> ByteOff) + -> [CmmNode O O] + -> [CmmNode O O] +elimStackStores stackmap stackmaps area_off nodes + = go stackmap nodes + where + go _stackmap [] = [] + go stackmap (n:ns) + = case n of + CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r)) + | Just (_,off) <- lookupUFM (sm_regs stackmap) r + , area_off area + m == off + -> pprTrace "eliminated a node!" 
(ppr r) $ go stackmap ns + _otherwise + -> n : go (procMiddle stackmaps n stackmap) ns + + +-- ----------------------------------------------------------------------------- +-- Update info tables to include stack liveness + + +setInfoTableStackMap :: BlockEnv StackMap -> CmmDecl -> CmmDecl +setInfoTableStackMap stackmaps + (CmmProc top_info@TopInfo{..} l g@CmmGraph{g_entry = eid}) + = CmmProc top_info{ info_tbl = fix_info info_tbl } l g + where + fix_info info_tbl@CmmInfoTable{ cit_rep = StackRep _ } = + info_tbl { cit_rep = StackRep (get_liveness eid) } + fix_info other = other + + get_liveness :: BlockId -> Liveness + get_liveness lbl + = case mapLookup lbl stackmaps of + Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl) + Just sm -> stackMapToLiveness sm + +setInfoTableStackMap _ d = d + + +stackMapToLiveness :: StackMap -> Liveness +stackMapToLiveness StackMap{..} = + reverse $ Array.elems $ + accumArray (\_ x -> x) True (toWords sm_ret_off + 1, + toWords (sm_sp - sm_args)) live_words + where + live_words = [ (toWords off, False) + | (r,off) <- eltsUFM sm_regs, isGcPtrType (localRegType r) ] + + +-- ----------------------------------------------------------------------------- +-- Lowering safe foreign calls + +{- +Note [lower safe foreign calls] + +We start with + + Sp[young(L1)] = L1 + ,----------------------- + | r1 = foo(x,y,z) returns to L1 + '----------------------- + L1: + R1 = r1 -- copyIn, inserted by mkSafeCall + ... + +the stack layout algorithm will arrange to save and reload everything +live across the call. Our job now is to expand the call so we get + + Sp[young(L1)] = L1 + ,----------------------- + | SAVE_THREAD_STATE() + | token = suspendThread(BaseReg, interruptible) + | r = foo(x,y,z) + | BaseReg = resumeThread(token) + | LOAD_THREAD_STATE() + | R1 = r -- copyOut + | jump L1 + '----------------------- + L1: + r = R1 -- copyIn, inserted by mkSafeCall + ... 
+ +Note the copyOut, which saves the results in the places that L1 is +expecting them (see Note {safe foreign call convention]). +-} + +lowerSafeForeignCall :: CmmBlock -> UniqSM CmmBlock +lowerSafeForeignCall block + | (entry, middle, CmmForeignCall { .. }) <- blockSplit block + = do + -- Both 'id' and 'new_base' are KindNonPtr because they're + -- RTS-only objects and are not subject to garbage collection + id <- newTemp bWord + new_base <- newTemp (cmmRegType (CmmGlobal BaseReg)) + let (caller_save, caller_load) = callerSaveVolatileRegs + load_tso <- newTemp gcWord + load_stack <- newTemp gcWord + let suspend = saveThreadState <*> + caller_save <*> + mkMiddle (callSuspendThread id intrbl) + midCall = mkUnsafeCall tgt res args + resume = mkMiddle (callResumeThread new_base id) <*> + -- Assign the result to BaseReg: we + -- might now have a different Capability! + mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <*> + caller_load <*> + loadThreadState load_tso load_stack + -- Note: The successor must be a procpoint, and we have already split, + -- so we use a jump, not a branch. + succLbl = CmmLit (CmmLabel (infoTblLbl succ)) + + (ret_args, copyout) = copyOutOflow NativeReturn Jump (Young succ) + (map (CmmReg . 
CmmLocal) res) + updfr (0, []) + + jump = CmmCall { cml_target = succLbl + , cml_cont = Just succ + , cml_args = widthInBytes wordWidth + , cml_ret_args = ret_args + , cml_ret_off = updfr } + + graph' <- lgraphOfAGraph $ suspend <*> + midCall <*> + resume <*> + copyout <*> + mkLast jump + + case toBlockList graph' of + [one] -> let (_, middle', last) = blockSplit one + in return (blockJoin entry (middle `blockAppend` middle') last) + _ -> panic "lowerSafeForeignCall0" + + -- Block doesn't end in a safe foreign call: + | otherwise = return block + + +foreignLbl :: FastString -> CmmExpr +foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId name)) + +newTemp :: CmmType -> UniqSM LocalReg +newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep) + +callSuspendThread :: LocalReg -> Bool -> CmmNode O O +callSuspendThread id intrbl = + CmmUnsafeForeignCall + (ForeignTarget (foreignLbl (fsLit "suspendThread")) + (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint])) + [id] [CmmReg (CmmGlobal BaseReg), CmmLit (mkIntCLit (fromEnum intrbl))] + +callResumeThread :: LocalReg -> LocalReg -> CmmNode O O +callResumeThread new_base id = + CmmUnsafeForeignCall + (ForeignTarget (foreignLbl (fsLit "resumeThread")) + (ForeignConvention CCallConv [AddrHint] [AddrHint])) + [new_base] [CmmReg (CmmLocal id)] + +-- ----------------------------------------------------------------------------- + +plusW :: ByteOff -> WordOff -> ByteOff +plusW b w = b + w * wORD_SIZE + +dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot] +dropEmpty 0 ss = Just ss +dropEmpty n (Empty : ss) = dropEmpty (n-1) ss +dropEmpty _ _ = Nothing + +isEmpty :: StackSlot -> Bool +isEmpty Empty = True +isEmpty _ = False + +localRegBytes :: LocalReg -> ByteOff +localRegBytes r = roundUpToWords (widthInBytes (typeWidth (localRegType r))) + +localRegWords :: LocalReg -> WordOff +localRegWords = toWords . 
localRegBytes + +toWords :: ByteOff -> WordOff +toWords x = x `quot` wORD_SIZE + + +insertReloads :: StackMap -> [CmmNode O O] +insertReloads stackmap = + [ CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot Old sp) + (localRegType r)) + | (r,sp) <- stackSlotRegs stackmap + ] + + +stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)] +stackSlotRegs sm = eltsUFM (sm_regs sm) + +-- ----------------------------------------------------------------------------- + +-- If we do this *before* stack layout, we might be able to avoid +-- saving some things across calls/procpoints. +-- +-- *but*, that will invalidate the liveness analysis, and we'll have +-- to re-do it. + +cmmSink :: CmmGraph -> UniqSM CmmGraph +cmmSink graph = do + let liveness = cmmLiveness graph + return $ cmmSink' liveness graph + +cmmSink' :: BlockEnv CmmLive -> CmmGraph -> CmmGraph +cmmSink' liveness graph + = ofBlockList (g_entry graph) $ sink mapEmpty $ postorderDfs graph + where + + sink :: BlockEnv [(LocalReg, CmmExpr)] -> [CmmBlock] -> [CmmBlock] + sink _ [] = [] + sink sunk (b:bs) = + pprTrace "sink" (ppr l) $ + blockJoin first final_middle last : sink sunk' bs + where + l = entryLabel b + (first, middle, last) = blockSplit b + (middle', assigs) = walk (blockToList middle) emptyBlock + (mapFindWithDefault [] l sunk) + + (dropped_last, assigs') = partition (`conflictsWithLast` last) assigs + + final_middle = foldl blockSnoc middle' (toNodes dropped_last) + + sunk' = mapUnion sunk $ + mapFromList [ (l, filt assigs' (getLive l)) + | l <- successors last ] + where + getLive l = mapFindWithDefault Set.empty l liveness + filt as live = [ (r,e) | (r,e) <- as, r `Set.member` live ] + + +walk :: [CmmNode O O] -> Block CmmNode O O -> [(LocalReg, CmmExpr)] + -> (Block CmmNode O O, [(LocalReg, CmmExpr)]) + +walk [] acc as = (acc, as) +walk (n:ns) acc as + | Just a <- collect_it = walk ns acc (a:as) + | otherwise = walk ns (foldr (flip blockSnoc) acc (n:drop_nodes)) as' + where + collect_it = case n of + CmmAssign 
(CmmLocal r) e@(CmmReg (CmmGlobal _)) -> Just (r,e) +-- CmmAssign (CmmLocal r) e@(CmmLoad addr _) | +-- foldRegsUsed (\b r -> False) True addr -> Just (r,e) + _ -> Nothing + + drop_nodes = toNodes dropped + (dropped, as') = partition should_drop as + where should_drop a = a `conflicts` n + +toNodes :: [(LocalReg,CmmExpr)] -> [CmmNode O O] +toNodes as = [ CmmAssign (CmmLocal r) rhs | (r,rhs) <- as ] + +-- We only sink "r = G" assignments right now, so conflicts is very simple: +conflicts :: (LocalReg,CmmExpr) -> CmmNode O O -> Bool +(_, rhs) `conflicts` CmmAssign reg _ | reg `regUsedIn` rhs = True +--(r, CmmLoad _ _) `conflicts` CmmStore _ _ = True +(r, _) `conflicts` node + = foldRegsUsed (\b r' -> r == r' || b) False node + +conflictsWithLast :: (LocalReg,CmmExpr) -> CmmNode O C -> Bool +(r, _) `conflictsWithLast` node + = foldRegsUsed (\b r' -> r == r' || b) False node diff --git a/compiler/cmm/CmmLint.hs b/compiler/cmm/CmmLint.hs index 01ebac6254..2e24dd7f82 100644 --- a/compiler/cmm/CmmLint.hs +++ b/compiler/cmm/CmmLint.hs @@ -1,67 +1,70 @@ ----------------------------------------------------------------------------- -- --- (c) The University of Glasgow 2004-2006 +-- (c) The University of Glasgow 2011 -- -- CmmLint: checking the correctness of Cmm statements and expressions -- ----------------------------------------------------------------------------- - -{-# OPTIONS -fno-warn-tabs #-} --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and --- detab the module (please do the detabbing in a separate patch). 
See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces --- for details - +{-# LANGUAGE GADTs #-} module CmmLint ( - cmmLint, cmmLintTop + cmmLint, cmmLintGraph ) where +import Hoopl +import Cmm +import CmmUtils +import PprCmm () import BlockId -import OldCmm -import CLabel +import FastString import Outputable -import OldPprCmm() import Constants -import FastString -import Platform import Data.Maybe +-- Things to check: +-- - invariant on CmmBlock in CmmExpr (see comment there) +-- - check for branches to blocks that don't exist +-- - check types + -- ----------------------------------------------------------------------------- -- Exported entry points: cmmLint :: (Outputable d, Outputable h) - => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc -cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops + => GenCmmGroup d h CmmGraph -> Maybe SDoc +cmmLint tops = runCmmLint (mapM_ lintCmmDecl) tops -cmmLintTop :: (Outputable d, Outputable h) - => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc -cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top +cmmLintGraph :: CmmGraph -> Maybe SDoc +cmmLintGraph g = runCmmLint lintCmmGraph g -runCmmLint :: Outputable a - => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc -runCmmLint _ l p = +runCmmLint :: Outputable a => (a -> CmmLint b) -> a -> Maybe SDoc +runCmmLint l p = case unCL (l p) of - Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"), - nest 2 err, - ptext $ sLit ("Program was:"), - nest 2 (ppr p)]) - Right _ -> Nothing - -lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint () -lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks)) - = addLintInfo (text "in proc " <> pprCLabel platform lbl) $ - let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks - in mapM_ (lintCmmBlock platform labels) blocks - -lintCmmDecl _ (CmmData {}) + Left err -> Just (vcat [ptext $ sLit ("Cmm lint 
error:"), + nest 2 err, + ptext $ sLit ("Program was:"), + nest 2 (ppr p)]) + Right _ -> Nothing + +lintCmmDecl :: GenCmmDecl h i CmmGraph -> CmmLint () +lintCmmDecl (CmmProc _ lbl g) + = addLintInfo (text "in proc " <> ppr lbl) $ lintCmmGraph g +lintCmmDecl (CmmData {}) = return () -lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint () -lintCmmBlock platform labels (BasicBlock id stmts) - = addLintInfo (text "in basic block " <> ppr id) $ - mapM_ (lintCmmStmt platform labels) stmts + +lintCmmGraph :: CmmGraph -> CmmLint () +lintCmmGraph g = mapM_ (lintCmmBlock labels) blocks + where + blocks = toBlockList g + labels = setFromList (map entryLabel blocks) + + +lintCmmBlock :: BlockSet -> CmmBlock -> CmmLint () +lintCmmBlock labels block + = addLintInfo (text "in basic block " <> ppr (entryLabel block)) $ do + let (_, middle, last) = blockSplit block + mapM_ lintCmmMiddle (blockToList middle) + lintCmmLast labels last -- ----------------------------------------------------------------------------- -- lintCmmExpr @@ -69,24 +72,24 @@ lintCmmBlock platform labels (BasicBlock id stmts) -- Checks whether a CmmExpr is "type-correct", and check for obvious-looking -- byte/word mismatches. -lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType -lintCmmExpr platform (CmmLoad expr rep) = do - _ <- lintCmmExpr platform expr +lintCmmExpr :: CmmExpr -> CmmLint CmmType +lintCmmExpr (CmmLoad expr rep) = do + _ <- lintCmmExpr expr -- Disabled, if we have the inlining phase before the lint phase, -- we can have funny offsets due to pointer tagging. -- EZY -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $ -- cmmCheckWordAddress expr return rep -lintCmmExpr platform expr@(CmmMachOp op args) = do - tys <- mapM (lintCmmExpr platform) args +lintCmmExpr expr@(CmmMachOp op args) = do + tys <- mapM lintCmmExpr args if map (typeWidth . 
cmmExprType) args == machOpArgReps op - then cmmCheckMachOp op args tys - else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op) -lintCmmExpr platform (CmmRegOff reg offset) - = lintCmmExpr platform (CmmMachOp (MO_Add rep) - [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)]) + then cmmCheckMachOp op args tys + else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op) +lintCmmExpr (CmmRegOff reg offset) + = lintCmmExpr (CmmMachOp (MO_Add rep) + [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)]) where rep = typeWidth (cmmRegType reg) -lintCmmExpr _ expr = +lintCmmExpr expr = return (cmmExprType expr) -- Check for some common byte/word mismatches (eg. Sp + 1) @@ -119,43 +122,61 @@ notNodeReg :: CmmExpr -> Bool notNodeReg (CmmReg reg) | reg == nodeReg = False notNodeReg _ = True -lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint () -lintCmmStmt platform labels = lint - where lint (CmmNop) = return () - lint (CmmComment {}) = return () - lint stmt@(CmmAssign reg expr) = do - erep <- lintCmmExpr platform expr - let reg_ty = cmmRegType reg +lintCmmMiddle :: CmmNode O O -> CmmLint () +lintCmmMiddle node = case node of + CmmComment _ -> return () + + CmmAssign reg expr -> do + erep <- lintCmmExpr expr + let reg_ty = cmmRegType reg if (erep `cmmEqType_ignoring_ptrhood` reg_ty) then return () - else cmmLintAssignErr stmt erep reg_ty - lint (CmmStore l r) = do - _ <- lintCmmExpr platform l - _ <- lintCmmExpr platform r + else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty + + CmmStore l r -> do + _ <- lintCmmExpr l + _ <- lintCmmExpr r return () - lint (CmmCall target _res args _) = - do lintTarget platform labels target - mapM_ (lintCmmExpr platform . 
hintlessCmm) args - lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e - lint (CmmSwitch e branches) = do + + CmmUnsafeForeignCall target _formals actuals -> do + lintTarget target + mapM_ lintCmmExpr actuals + + +lintCmmLast :: BlockSet -> CmmNode O C -> CmmLint () +lintCmmLast labels node = case node of + CmmBranch id -> checkTarget id + + CmmCondBranch e t f -> do + mapM_ checkTarget [t,f] + _ <- lintCmmExpr e + checkCond e + + CmmSwitch e branches -> do mapM_ checkTarget $ catMaybes branches - erep <- lintCmmExpr platform e + erep <- lintCmmExpr e if (erep `cmmEqType_ignoring_ptrhood` bWord) then return () - else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <> - text " :: " <> ppr erep) - lint (CmmJump e _) = lintCmmExpr platform e >> return () - lint (CmmReturn) = return () - lint (CmmBranch id) = checkTarget id - checkTarget id = if setMember id labels then return () - else cmmLintErr (text "Branch to nonexistent id" <+> ppr id) - -lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint () -lintTarget platform _ (CmmCallee e _) = do _ <- lintCmmExpr platform e - return () -lintTarget _ _ (CmmPrim _ Nothing) = return () -lintTarget platform labels (CmmPrim _ (Just stmts)) - = mapM_ (lintCmmStmt platform labels) stmts + else cmmLintErr (text "switch scrutinee is not a word: " <> + ppr e <> text " :: " <> ppr erep) + + CmmCall { cml_target = target, cml_cont = cont } -> do + _ <- lintCmmExpr target + maybe (return ()) checkTarget cont + + CmmForeignCall tgt _ args succ _ _ -> do + lintTarget tgt + mapM_ lintCmmExpr args + checkTarget succ + where + checkTarget id + | setMember id labels = return () + | otherwise = cmmLintErr (text "Branch to nonexistent id" <+> ppr id) + + +lintTarget :: ForeignTarget -> CmmLint () +lintTarget (ForeignTarget e _) = lintCmmExpr e >> return () +lintTarget (PrimTarget {}) = return () checkCond :: CmmExpr -> CmmLint () @@ -163,7 +184,7 @@ checkCond (CmmMachOp mop _) | 
isComparisonMachOp mop = return () checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values checkCond expr = cmmLintErr (hang (text "expression is not a conditional:") 2 - (ppr expr)) + (ppr expr)) -- ----------------------------------------------------------------------------- -- CmmLint monad @@ -173,37 +194,36 @@ checkCond expr newtype CmmLint a = CmmLint { unCL :: Either SDoc a } instance Monad CmmLint where - CmmLint m >>= k = CmmLint $ case m of - Left e -> Left e - Right a -> unCL (k a) + CmmLint m >>= k = CmmLint $ case m of + Left e -> Left e + Right a -> unCL (k a) return a = CmmLint (Right a) cmmLintErr :: SDoc -> CmmLint a cmmLintErr msg = CmmLint (Left msg) addLintInfo :: SDoc -> CmmLint a -> CmmLint a -addLintInfo info thing = CmmLint $ +addLintInfo info thing = CmmLint $ case unCL thing of - Left err -> Left (hang info 2 err) - Right a -> Right a + Left err -> Left (hang info 2 err) + Right a -> Right a cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a cmmLintMachOpErr expr argsRep opExpectsRep - = cmmLintErr (text "in MachOp application: " $$ - nest 2 (ppr expr) $$ - (text "op is expecting: " <+> ppr opExpectsRep) $$ - (text "arguments provide: " <+> ppr argsRep)) + = cmmLintErr (text "in MachOp application: " $$ + nest 2 (ppr expr) $$ + (text "op is expecting: " <+> ppr opExpectsRep) $$ + (text "arguments provide: " <+> ppr argsRep)) -cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a +cmmLintAssignErr :: CmmNode e x -> CmmType -> CmmType -> CmmLint a cmmLintAssignErr stmt e_ty r_ty - = cmmLintErr (text "in assignment: " $$ - nest 2 (vcat [ppr stmt, - text "Reg ty:" <+> ppr r_ty, - text "Rhs ty:" <+> ppr e_ty])) - - + = cmmLintErr (text "in assignment: " $$ + nest 2 (vcat [ppr stmt, + text "Reg ty:" <+> ppr r_ty, + text "Rhs ty:" <+> ppr e_ty])) + cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a cmmLintDubiousWordOffset expr = cmmLintErr (text "offset is not a multiple of words: " 
$$ - nest 2 (ppr expr)) + nest 2 (ppr expr)) diff --git a/compiler/cmm/CmmLive.hs b/compiler/cmm/CmmLive.hs index 9a5bb2d5ae..f0163fefc4 100644 --- a/compiler/cmm/CmmLive.hs +++ b/compiler/cmm/CmmLive.hs @@ -11,17 +11,15 @@ module CmmLive ) where +import UniqSupply import BlockId import Cmm import CmmUtils -import Control.Monad -import OptimizationFuel import PprCmmExpr () -import Compiler.Hoopl +import Hoopl import Maybes import Outputable -import UniqSet ----------------------------------------------------------------------------- -- Calculating what variables are live on entry to a basic block @@ -33,8 +31,10 @@ type CmmLive = RegSet -- | The dataflow lattice liveLattice :: DataflowLattice CmmLive liveLattice = DataflowLattice "live LocalReg's" emptyRegSet add - where add _ (OldFact old) (NewFact new) = case unionUniqSets old new of - join -> (changeIf $ sizeUniqSet join > sizeUniqSet old, join) + where add _ (OldFact old) (NewFact new) = + (changeIf $ sizeRegSet join > sizeRegSet old, join) + where !join = plusRegSet old new + -- | A mapping from block labels to the variables live on entry type BlockEntryLiveness = BlockEnv CmmLive @@ -43,16 +43,17 @@ type BlockEntryLiveness = BlockEnv CmmLive -- | Calculated liveness info for a CmmGraph ----------------------------------------------------------------------------- -cmmLiveness :: CmmGraph -> FuelUniqSM BlockEntryLiveness +cmmLiveness :: CmmGraph -> BlockEntryLiveness cmmLiveness graph = - liftM check $ liftM snd $ dataflowPassBwd graph [] $ analBwd liveLattice xferLive + check $ dataflowAnalBwd graph [] $ analBwd liveLattice xferLive where entry = g_entry graph - check facts = noLiveOnEntry entry (expectJust "check" $ mapLookup entry facts) facts + check facts = noLiveOnEntry entry + (expectJust "check" $ mapLookup entry facts) facts -- | On entry to the procedure, there had better not be any LocalReg's live-in. 
noLiveOnEntry :: BlockId -> CmmLive -> a -> a noLiveOnEntry bid in_fact x = - if isEmptyUniqSet in_fact then x + if nullRegSet in_fact then x else pprPanic "LocalReg's live-in to graph" (ppr bid <+> ppr in_fact) -- | The transfer equations use the traditional 'gen' and 'kill' @@ -60,42 +61,42 @@ noLiveOnEntry bid in_fact x = gen :: UserOfLocalRegs a => a -> RegSet -> RegSet gen a live = foldRegsUsed extendRegSet live a kill :: DefinerOfLocalRegs a => a -> RegSet -> RegSet -kill a live = foldRegsDefd delOneFromUniqSet live a +kill a live = foldRegsDefd deleteFromRegSet live a -gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a) => a -> CmmLive -> CmmLive +gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a) + => a -> CmmLive -> CmmLive gen_kill a = gen a . kill a -- | The transfer function --- EZY: Bits of this analysis are duplicated in CmmSpillReload, though --- it's not really easy to efficiently reuse all of this. Keep in mind --- if you need to update this analysis. xferLive :: BwdTransfer CmmNode CmmLive xferLive = mkBTransfer3 fst mid lst where fst _ f = f mid :: CmmNode O O -> CmmLive -> CmmLive mid n f = gen_kill n f lst :: CmmNode O C -> FactBase CmmLive -> CmmLive - -- slightly inefficient: kill is unnecessary for emptyRegSet - lst n f = gen_kill n - $ case n of CmmCall{} -> emptyRegSet - CmmForeignCall{} -> emptyRegSet - _ -> joinOutFacts liveLattice n f + lst n f = gen_kill n $ joinOutFacts liveLattice n f ----------------------------------------------------------------------------- -- Removing assignments to dead variables ----------------------------------------------------------------------------- -removeDeadAssignments :: CmmGraph -> FuelUniqSM CmmGraph +removeDeadAssignments :: CmmGraph -> UniqSM (CmmGraph, BlockEnv CmmLive) removeDeadAssignments g = - liftM fst $ dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites - where rewrites = deepBwdRw3 nothing middle nothing - -- Beware: deepBwdRw with one polymorphic function seems more 
reasonable here, - -- but GHC panics while compiling, see bug #4045. + dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites + where rewrites = mkBRewrite3 nothing middle nothing + -- SDM: no need for deepBwdRw here, we only rewrite to empty + -- Beware: deepBwdRw with one polymorphic function seems more + -- reasonable here, but GHC panics while compiling, see bug + -- #4045. middle :: CmmNode O O -> Fact O CmmLive -> CmmReplGraph O O - middle (CmmAssign (CmmLocal reg') _) live | not (reg' `elemRegSet` live) = return $ Just emptyGraph + middle (CmmAssign (CmmLocal reg') _) live + | not (reg' `elemRegSet` live) + = return $ Just emptyGraph -- XXX maybe this should be somewhere else... - middle (CmmAssign lhs (CmmReg rhs)) _ | lhs == rhs = return $ Just emptyGraph - middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs = return $ Just emptyGraph + middle (CmmAssign lhs (CmmReg rhs)) _ | lhs == rhs + = return $ Just emptyGraph + middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs + = return $ Just emptyGraph middle _ _ = return Nothing nothing :: CmmNode e x -> Fact x CmmLive -> CmmReplGraph e x diff --git a/compiler/cmm/CmmNode.hs b/compiler/cmm/CmmNode.hs index 4844af9d9a..9e75387436 100644 --- a/compiler/cmm/CmmNode.hs +++ b/compiler/cmm/CmmNode.hs @@ -18,7 +18,7 @@ module CmmNode ( CmmNode(..), ForeignHint(..), CmmFormal, CmmActual, UpdFrameOffset, Convention(..), ForeignConvention(..), ForeignTarget(..), mapExp, mapExpDeep, wrapRecExp, foldExp, foldExpDeep, wrapRecExpf, - mapExpM, mapExpDeepM, wrapRecExpM + mapExpM, mapExpDeepM, wrapRecExpM, mapSuccessors ) where import CmmExpr @@ -35,15 +35,17 @@ import Prelude hiding (succ) ------------------------ -- CmmNode +#define ULabel {-# UNPACK #-} !Label + data CmmNode e x where - CmmEntry :: Label -> CmmNode C O + CmmEntry :: ULabel -> CmmNode C O CmmComment :: FastString -> CmmNode O O - CmmAssign :: CmmReg -> CmmExpr -> CmmNode O O + CmmAssign :: !CmmReg -> !CmmExpr -> CmmNode O O -- Assign to register - 
CmmStore :: CmmExpr -> CmmExpr -> CmmNode O O + CmmStore :: !CmmExpr -> !CmmExpr -> CmmNode O O -- Assign to memory location. Size is -- given by cmmExprType of the rhs. @@ -60,11 +62,12 @@ data CmmNode e x where -- bug for what can be put in arguments, see -- Note [Register Parameter Passing] - CmmBranch :: Label -> CmmNode O C -- Goto another block in the same procedure + CmmBranch :: ULabel -> CmmNode O C + -- Goto another block in the same procedure CmmCondBranch :: { -- conditional branch cml_pred :: CmmExpr, - cml_true, cml_false :: Label + cml_true, cml_false :: ULabel } -> CmmNode O C CmmSwitch :: CmmExpr -> [Maybe Label] -> CmmNode O C -- Table branch @@ -78,6 +81,11 @@ data CmmNode e x where cml_cont :: Maybe Label, -- Label of continuation (Nothing for return or tail call) + -- + -- Note [Continuation BlockId]: these BlockIds are called + -- Continuation BlockIds, and are the only BlockIds that can + -- occur in CmmExprs, namely as (CmmLit (CmmBlock b)) or + -- (CmmStackSlot (Young b) _). -- ToDO: add this: -- cml_args_regs :: [GlobalReg], @@ -117,7 +125,7 @@ data CmmNode e x where tgt :: ForeignTarget, -- call target and convention res :: [CmmFormal], -- zero or more results args :: [CmmActual], -- zero or more arguments; see Note [Register parameter passing] - succ :: Label, -- Label of continuation + succ :: ULabel, -- Label of continuation updfr :: UpdFrameOffset, -- where the update frame is (for building infotable) intrbl:: Bool -- whether or not the call is interruptible } -> CmmNode O C @@ -218,14 +226,6 @@ data Convention | GC -- Entry to the garbage collector: uses the node reg! 
| PrimOpCall -- Calling prim ops | PrimOpReturn -- Returning from prim ops - | Foreign -- Foreign call/return - ForeignConvention - | Private - -- Used for control transfers within a (pre-CPS) procedure All - -- jump sites known, never pushed on the stack (hence no SRT) - -- You can choose whatever calling convention you please - -- (provided you make sure all the call sites agree)! - -- This data type eventually to be extended to record the convention. deriving( Eq ) data ForeignConvention @@ -283,37 +283,6 @@ instance DefinerOfLocalRegs (CmmNode e x) where fold f z n = foldRegsDefd f z n -instance UserOfSlots (CmmNode e x) where - foldSlotsUsed f z n = case n of - CmmAssign _ expr -> fold f z expr - CmmStore addr rval -> fold f (fold f z addr) rval - CmmUnsafeForeignCall _ _ args -> fold f z args - CmmCondBranch expr _ _ -> fold f z expr - CmmSwitch expr _ -> fold f z expr - CmmCall {cml_target=tgt} -> fold f z tgt - CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args - _ -> z - where fold :: forall a b. - UserOfSlots a => - (b -> SubArea -> b) -> b -> a -> b - fold f z n = foldSlotsUsed f z n - -instance UserOfSlots ForeignTarget where - foldSlotsUsed f z (ForeignTarget e _) = foldSlotsUsed f z e - foldSlotsUsed _f z (PrimTarget _) = z - -instance DefinerOfSlots (CmmNode e x) where - foldSlotsDefd f z n = case n of - CmmStore (CmmStackSlot a i) expr -> f z (a, i, widthInBytes $ typeWidth $ cmmExprType expr) - CmmForeignCall {res=res} -> fold f z $ map foreign_call_slot res - _ -> z - where - fold :: forall a b. 
- DefinerOfSlots a => - (b -> SubArea -> b) -> b -> a -> b - fold f z n = foldSlotsDefd f z n - foreign_call_slot r = case widthInBytes $ typeWidth $ localRegType r of w -> (RegSlot r, w, w) - ----------------------------------- -- mapping Expr in CmmNode @@ -416,4 +385,20 @@ foldExp f (CmmCall {cml_target=tgt}) z = f tgt z foldExp f (CmmForeignCall {tgt=tgt, args=args}) z = foldr f (foldExpForeignTarget f tgt z) args foldExpDeep :: (CmmExpr -> z -> z) -> CmmNode e x -> z -> z -foldExpDeep f = foldExp $ wrapRecExpf f +foldExpDeep f = foldExp go + where -- go :: CmmExpr -> z -> z + go e@(CmmMachOp _ es) z = gos es $! f e z + go e@(CmmLoad addr _) z = go addr $! f e z + go e z = f e z + + gos [] z = z + gos (e:es) z = gos es $! f e z + +-- ----------------------------------------------------------------------------- + +mapSuccessors :: (Label -> Label) -> CmmNode O C -> CmmNode O C +mapSuccessors f (CmmBranch bid) = CmmBranch (f bid) +mapSuccessors f (CmmCondBranch p y n) = CmmCondBranch p (f y) (f n) +mapSuccessors f (CmmSwitch e arms) = CmmSwitch e (map (fmap f) arms) +mapSuccessors _ n = n + diff --git a/compiler/cmm/CmmOpt.hs b/compiler/cmm/CmmOpt.hs index 8cc18fc1ca..8ff04cfa7b 100644 --- a/compiler/cmm/CmmOpt.hs +++ b/compiler/cmm/CmmOpt.hs @@ -145,8 +145,7 @@ To inline _smi: -} countUses :: UserOfLocalRegs a => a -> UniqFM Int -countUses a = foldRegsUsed (\m r -> addToUFM m r (count m r + 1)) emptyUFM a - where count m r = lookupWithDefaultUFM m (0::Int) r +countUses a = foldRegsUsed (\m r -> addToUFM_C (+) m r 1) emptyUFM a cmmMiniInline :: DynFlags -> [CmmBasicBlock] -> [CmmBasicBlock] cmmMiniInline dflags blocks = map do_inline blocks @@ -157,25 +156,16 @@ cmmMiniInlineStmts :: DynFlags -> UniqFM Int -> [CmmStmt] -> [CmmStmt] cmmMiniInlineStmts _ _ [] = [] cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr) : stmts) -- not used: just discard this assignment - | Nothing <- lookupUFM uses u + | 0 <- lookupWithDefaultUFM uses 0 u = 
cmmMiniInlineStmts dflags uses stmts - -- used (literal): try to inline at all the use sites - | Just n <- lookupUFM uses u, isLit expr - = - ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $ - case lookForInlineLit u expr stmts of - (m, stmts') - | n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts' - | otherwise -> - stmt : cmmMiniInlineStmts dflags (adjustUFM (\x -> x - m) uses u) stmts' - - -- used (foldable to literal): try to inline at all the use sites + -- used (foldable to small thing): try to inline at all the use sites | Just n <- lookupUFM uses u, - e@(CmmLit _) <- wrapRecExp foldExp expr + e <- wrapRecExp foldExp expr, + isTiny e = ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $ - case lookForInlineLit u e stmts of + case lookForInlineMany u e stmts of (m, stmts') | n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts' | otherwise -> @@ -188,6 +178,11 @@ cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr) ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $ cmmMiniInlineStmts dflags uses stmts' where + isTiny (CmmLit _) = True + isTiny (CmmReg (CmmGlobal _)) = True + -- not CmmLocal: that might invalidate the usage analysis results + isTiny _ = False + platform = targetPlatform dflags foldExp (CmmMachOp op args) = cmmMachOpFold platform op args foldExp e = e @@ -201,26 +196,28 @@ cmmMiniInlineStmts platform uses (stmt:stmts) -- register, and a list of statements. Inlines the expression at all -- use sites of the register. Returns the number of substituations -- made and the, possibly modified, list of statements. 
-lookForInlineLit :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt]) -lookForInlineLit _ _ [] = (0, []) -lookForInlineLit u expr stmts@(stmt : rest) - | Just n <- lookupUFM (countUses stmt) u - = case lookForInlineLit u expr rest of - (m, stmts) -> let z = n + m - in z `seq` (z, inlineStmt u expr stmt : stmts) - - | ok_to_skip - = case lookForInlineLit u expr rest of +lookForInlineMany :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt]) +lookForInlineMany u expr stmts = lookForInlineMany' u expr regset stmts + where regset = foldRegsUsed extendRegSet emptyRegSet expr + +lookForInlineMany' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> (Int, [CmmStmt]) +lookForInlineMany' _ _ _ [] = (0, []) +lookForInlineMany' u expr regset stmts@(stmt : rest) + | Just n <- lookupUFM (countUses stmt) u, okToInline expr stmt + = let stmt' = inlineStmt u expr stmt in + if okToSkip stmt' u expr regset + then case lookForInlineMany' u expr regset rest of + (m, stmts) -> let z = n + m + in z `seq` (z, stmt' : stmts) + else (n, stmt' : rest) + + | okToSkip stmt u expr regset + = case lookForInlineMany' u expr regset rest of (n, stmts) -> (n, stmt : stmts) | otherwise = (0, stmts) - where - -- We skip over assignments to registers, unless the register - -- being assigned to is the one we're inlining. 
- ok_to_skip = case stmt of - CmmAssign (CmmLocal (LocalReg u' _)) _ | u' == u -> False - _other -> True + lookForInline :: Unique -> CmmExpr -> [CmmStmt] -> Maybe [CmmStmt] lookForInline u expr stmts = lookForInline' u expr regset stmts @@ -229,10 +226,10 @@ lookForInline u expr stmts = lookForInline' u expr regset stmts lookForInline' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> Maybe [CmmStmt] lookForInline' _ _ _ [] = panic "lookForInline' []" lookForInline' u expr regset (stmt : rest) - | Just 1 <- lookupUFM (countUses stmt) u, ok_to_inline + | Just 1 <- lookupUFM (countUses stmt) u, okToInline expr stmt = Just (inlineStmt u expr stmt : rest) - | ok_to_skip + | okToSkip stmt u expr regset = case lookForInline' u expr regset rest of Nothing -> Nothing Just stmts -> Just (stmt:stmts) @@ -240,31 +237,37 @@ lookForInline' u expr regset (stmt : rest) | otherwise = Nothing - where - -- we don't inline into CmmCall if the expression refers to global - -- registers. This is a HACK to avoid global registers clashing with - -- C argument-passing registers, really the back-end ought to be able - -- to handle it properly, but currently neither PprC nor the NCG can - -- do it. See also CgForeignCall:load_args_into_temps. - ok_to_inline = case stmt of - CmmCall{} -> hasNoGlobalRegs expr - _ -> True - - -- Expressions aren't side-effecting. Temporaries may or may not - -- be single-assignment depending on the source (the old code - -- generator creates single-assignment code, but hand-written Cmm - -- and Cmm from the new code generator is not single-assignment.) - -- So we do an extra check to make sure that the register being - -- changed is not one we were relying on. I don't know how much of a - -- performance hit this is (we have to create a regset for every - -- instruction.) 
-- EZY - ok_to_skip = case stmt of - CmmNop -> True - CmmComment{} -> True - CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True - CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr) - _other -> False +-- we don't inline into CmmCall if the expression refers to global +-- registers. This is a HACK to avoid global registers clashing with +-- C argument-passing registers, really the back-end ought to be able +-- to handle it properly, but currently neither PprC nor the NCG can +-- do it. See also CgForeignCall:load_args_into_temps. +okToInline :: CmmExpr -> CmmStmt -> Bool +okToInline expr CmmCall{} = hasNoGlobalRegs expr +okToInline _ _ = True + +-- Expressions aren't side-effecting. Temporaries may or may not +-- be single-assignment depending on the source (the old code +-- generator creates single-assignment code, but hand-written Cmm +-- and Cmm from the new code generator is not single-assignment.) +-- So we do an extra check to make sure that the register being +-- changed is not one we were relying on. I don't know how much of a +-- performance hit this is (we have to create a regset for every +-- instruction.) 
-- EZY +okToSkip :: CmmStmt -> Unique -> CmmExpr -> RegSet -> Bool +okToSkip stmt u expr regset + = case stmt of + CmmNop -> True + CmmComment{} -> True + CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True + CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr) + CmmStore _ _ -> not_a_load expr + _other -> False + where + not_a_load (CmmMachOp _ args) = all not_a_load args + not_a_load (CmmLoad _ _) = False + not_a_load _ = True inlineStmt :: Unique -> CmmExpr -> CmmStmt -> CmmStmt inlineStmt u a (CmmAssign r e) = CmmAssign r (inlineExpr u a e) diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y index 075ed22ea9..f46d49e022 100644 --- a/compiler/cmm/CmmParse.y +++ b/compiler/cmm/CmmParse.y @@ -230,35 +230,31 @@ lits :: { [ExtFCode CmmExpr] } cmmproc :: { ExtCode } -- TODO: add real SRT/info tables to parsed Cmm - : info maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}' - { do ((entry_ret_label, info, live, formals, gc_block, frame), stmts) <- + : info maybe_formals_without_hints '{' body '}' + { do ((entry_ret_label, info, live, formals), stmts) <- getCgStmtsEC' $ loopDecls $ do { (entry_ret_label, info, live) <- $1; formals <- sequence $2; - gc_block <- $3; - frame <- $4; - $6; - return (entry_ret_label, info, live, formals, gc_block, frame) } + $4; + return (entry_ret_label, info, live, formals) } blks <- code (cgStmtsToBlocks stmts) - code (emitInfoTableAndCode entry_ret_label (CmmInfo gc_block frame info) formals blks) } + code (emitInfoTableAndCode entry_ret_label info formals blks) } | info maybe_formals_without_hints ';' { do (entry_ret_label, info, live) <- $1; formals <- sequence $2; - code (emitInfoTableAndCode entry_ret_label (CmmInfo Nothing Nothing info) formals []) } + code (emitInfoTableAndCode entry_ret_label info formals []) } - | NAME maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}' + | NAME maybe_formals_without_hints '{' body '}' {% withThisPackage $ 
\pkg -> do newFunctionName $1 pkg - ((formals, gc_block, frame), stmts) <- + (formals, stmts) <- getCgStmtsEC' $ loopDecls $ do { formals <- sequence $2; - gc_block <- $3; - frame <- $4; - $6; - return (formals, gc_block, frame) } + $4; + return formals } blks <- code (cgStmtsToBlocks stmts) - code (emitProc (CmmInfo gc_block frame CmmNonInfoTable) (mkCmmCodeLabel pkg $1) formals blks) } + code (emitProc CmmNonInfoTable (mkCmmCodeLabel pkg $1) formals blks) } info :: { ExtFCode (CLabel, CmmInfoTable, [Maybe LocalReg]) } : 'INFO_TABLE' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ')' @@ -599,18 +595,7 @@ formals_without_hints :: { [ExtFCode LocalReg] } formal_without_hint :: { ExtFCode LocalReg } : type NAME { newLocal $1 $2 } -maybe_frame :: { ExtFCode (Maybe UpdateFrame) } - : {- empty -} { return Nothing } - | 'jump' expr '(' exprs0 ')' { do { target <- $2; - args <- sequence $4; - return $ Just (UpdateFrame target args) } } - -maybe_gc_block :: { ExtFCode (Maybe BlockId) } - : {- empty -} { return Nothing } - | 'goto' NAME - { do l <- lookupLabel $2; return (Just l) } - -type :: { CmmType } +type :: { CmmType } : 'bits8' { b8 } | typenot8 { $1 } @@ -1073,7 +1058,8 @@ parseCmmFile dflags filename = do let msg = mkPlainErrMsg dflags span err return ((emptyBag, unitBag msg), Nothing) POk pst code -> do - cmm <- initC dflags no_module (getCmm (unEC code initEnv [] >> return ())) + st <- initC + let (cmm,_) = runC dflags no_module st (getCmm (unEC code initEnv [] >> return ())) let ms = getMessages pst if (errorsFound dflags ms) then return (ms, Nothing) diff --git a/compiler/cmm/CmmPipeline.hs b/compiler/cmm/CmmPipeline.hs index 7361bbf385..bb8d5b2f22 100644 --- a/compiler/cmm/CmmPipeline.hs +++ b/compiler/cmm/CmmPipeline.hs @@ -11,25 +11,23 @@ module CmmPipeline ( import CLabel import Cmm -import CmmLive +import CmmLint import CmmBuildInfoTables import CmmCommonBlockElim import CmmProcPoint -import CmmSpillReload -import CmmRewriteAssignments -import 
CmmStackLayout import CmmContFlowOpt -import OptimizationFuel +import CmmLayoutStack +import UniqSupply import DynFlags import ErrUtils import HscTypes import Data.Maybe import Control.Monad -import Data.Map (Map) -import qualified Data.Map as Map import Outputable -import StaticFlags + +import qualified Data.Set as Set +import Data.Map (Map) ----------------------------------------------------------------------------- -- | Top level driver for C-- pipeline @@ -53,32 +51,28 @@ import StaticFlags -- we actually need to do the initial pass. cmmPipeline :: HscEnv -- Compilation env including -- dynamic flags: -dcmm-lint -ddump-cps-cmm - -> (TopSRT, [CmmGroup]) -- SRT table and accumulating list of compiled procs + -> TopSRT -- SRT table and accumulating list of compiled procs -> CmmGroup -- Input C-- with Procedures - -> IO (TopSRT, [CmmGroup]) -- Output CPS transformed C-- -cmmPipeline hsc_env (topSRT, rst) prog = + -> IO (TopSRT, CmmGroup) -- Output CPS transformed C-- +cmmPipeline hsc_env topSRT prog = do let dflags = hsc_dflags hsc_env -- showPass dflags "CPSZ" - let tops = runCmmContFlowOpts prog - (cafEnvs, tops) <- liftM unzip $ mapM (cpsTop hsc_env) tops + (cafEnvs, tops) <- {-# SCC "tops" #-} liftM unzip $ mapM (cpsTop hsc_env) prog -- tops :: [[(CmmDecl,CAFSet]] (one list per group) - let topCAFEnv = mkTopCAFInfo (concat cafEnvs) + let topCAFEnv = {-# SCC "topCAFEnv" #-} mkTopCAFInfo (concat cafEnvs) -- folding over the groups - (topSRT, tops) <- foldM (toTops hsc_env topCAFEnv) (topSRT, []) tops + (topSRT, tops) <- {-# SCC "toTops" #-} foldM (toTops topCAFEnv) (topSRT, []) tops let cmms :: CmmGroup cmms = reverse (concat tops) dumpIfSet_dyn dflags Opt_D_dump_cps_cmm "Post CPS Cmm" (ppr cmms) - -- SRT is not affected by control flow optimization pass - let prog' = runCmmContFlowOpts cmms - - return (topSRT, prog' : rst) + return (topSRT, cmms) {- [Note global fuel] ~~~~~~~~~~~~~~~~~~~~~ @@ -92,105 +86,110 @@ global to one compiler session. 
-- -ddump-cmmz cpsTop :: HscEnv -> CmmDecl -> IO ([(CLabel, CAFSet)], [(CAFSet, CmmDecl)]) -cpsTop _ p@(CmmData {}) = return ([], [(Map.empty, p)]) +cpsTop _ p@(CmmData {}) = return ([], [(Set.empty, p)]) cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}}) l g) = do - -- Why bother doing these early: dualLivenessWithInsertion, - -- insertLateReloads, rewriteAssignments? + ----------- Control-flow optimisations --------------- + g <- {-# SCC "cmmCfgOpts(1)" #-} return $ cmmCfgOpts g + dump Opt_D_dump_cmmz_cfg "Post control-flow optimsations" g ----------- Eliminate common blocks ------------------- - g <- return $ elimCommonBlocks g + g <- {-# SCC "elimCommonBlocks" #-} return $ elimCommonBlocks g dump Opt_D_dump_cmmz_cbe "Post common block elimination" g - -- Any work storing block Labels must be performed _after_ elimCommonBlocks + -- Any work storing block Labels must be performed _after_ + -- elimCommonBlocks ----------- Proc points ------------------- - let callPPs = callProcPoints g - procPoints <- run $ minimalProcPointSet (targetPlatform dflags) callPPs g - g <- run $ addProcPointProtocols callPPs procPoints g - dump Opt_D_dump_cmmz_proc "Post Proc Points Added" g - - ----------- Spills and reloads ------------------- - g <- run $ dualLivenessWithInsertion procPoints g - dump Opt_D_dump_cmmz_spills "Post spills and reloads" g - - ----------- Sink and inline assignments ------------------- - g <- runOptimization $ rewriteAssignments platform g - dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g - - ----------- Eliminate dead assignments ------------------- - g <- runOptimization $ removeDeadAssignments g - dump Opt_D_dump_cmmz_dead "Post remove dead assignments" g - - ----------- Zero dead stack slots (Debug only) --------------- - -- Debugging: stubbing slots on death can cause crashes early - g <- if opt_StubDeadValues - then run $ stubSlotsOnDeath g - else return g - dump Opt_D_dump_cmmz_stub "Post stub dead stack 
slots" g - - --------------- Stack layout ---------------- - slotEnv <- run $ liveSlotAnal g - let spEntryMap = getSpEntryMap entry_off g - mbpprTrace "live slot analysis results: " (ppr slotEnv) $ return () - let areaMap = layout procPoints spEntryMap slotEnv entry_off g - mbpprTrace "areaMap" (ppr areaMap) $ return () - - ------------ Manifest the stack pointer -------- - g <- run $ manifestSP spEntryMap areaMap entry_off g - dump Opt_D_dump_cmmz_sp "Post manifestSP" g - -- UGH... manifestSP can require updates to the procPointMap. - -- We can probably do something quicker here for the update... + let callPPs = {-# SCC "callProcPoints" #-} callProcPoints g + procPoints <- {-# SCC "minimalProcPointSet" #-} runUniqSM $ + minimalProcPointSet (targetPlatform dflags) callPPs g + + ----------- Layout the stack and manifest Sp --------------- + -- (also does: removeDeadAssignments, and lowerSafeForeignCalls) + (g, stackmaps) <- {-# SCC "layoutStack" #-} + runUniqSM $ cmmLayoutStack procPoints entry_off g + dump Opt_D_dump_cmmz_sp "Layout Stack" g + +-- g <- {-# SCC "sink" #-} runUniqSM $ cmmSink g +-- dump Opt_D_dump_cmmz_rewrite "Sink assignments" g + +-- ----------- Sink and inline assignments ------------------- +-- g <- {-# SCC "rewriteAssignments" #-} runOptimization $ +-- rewriteAssignments platform g +-- dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g ------------- Split into separate procedures ------------ - procPointMap <- run $ procPointAnalysis procPoints g - dump Opt_D_dump_cmmz_procmap "procpoint map" procPointMap - gs <- run $ splitAtProcPoints l callPPs procPoints procPointMap - (CmmProc h l g) - mapM_ (dump Opt_D_dump_cmmz_split "Post splitting") gs - - ------------- More CAFs and foreign calls ------------ - cafEnv <- run $ cafAnal g - let localCAFs = catMaybes $ map (localCAFInfo cafEnv) gs + procPointMap <- {-# SCC "procPointAnalysis" #-} runUniqSM $ + procPointAnalysis procPoints g + dumpWith dflags Opt_D_dump_cmmz_procmap "procpoint map" 
procPointMap + gs <- {-# SCC "splitAtProcPoints" #-} runUniqSM $ + splitAtProcPoints l callPPs procPoints procPointMap (CmmProc h l g) + dumps Opt_D_dump_cmmz_split "Post splitting" gs + + ------------- More CAFs ------------------------------ + let cafEnv = {-# SCC "cafAnal" #-} cafAnal g + let localCAFs = {-# SCC "localCAFs" #-} catMaybes $ map (localCAFInfo cafEnv) gs mbpprTrace "localCAFs" (ppr localCAFs) $ return () - gs <- run $ mapM (lowerSafeForeignCalls areaMap) gs - mapM_ (dump Opt_D_dump_cmmz_lower "Post lowerSafeForeignCalls") gs - -- NO MORE GRAPH TRANSFORMATION AFTER HERE -- JUST MAKING INFOTABLES - gs <- return $ map (setInfoTableStackMap slotEnv areaMap) gs - mapM_ (dump Opt_D_dump_cmmz_info "after setInfoTableStackMap") gs - gs <- return $ map (bundleCAFs cafEnv) gs - mapM_ (dump Opt_D_dump_cmmz_cafs "after bundleCAFs") gs + gs <- {-# SCC "setInfoTableStackMap" #-} + return $ map (setInfoTableStackMap stackmaps) gs + dumps Opt_D_dump_cmmz_info "after setInfoTableStackMap" gs + + ----------- Control-flow optimisations --------------- + gs <- {-# SCC "cmmCfgOpts(2)" #-} return $ map cmmCfgOptsProc gs + dumps Opt_D_dump_cmmz_cfg "Post control-flow optimsations" gs + + gs <- {-# SCC "bundleCAFs" #-} return $ map (bundleCAFs cafEnv) gs + dumps Opt_D_dump_cmmz_cafs "after bundleCAFs" gs + return (localCAFs, gs) -- gs :: [ (CAFSet, CmmDecl) ] -- localCAFs :: [ (CLabel, CAFSet) ] -- statics filtered out(?) where dflags = hsc_dflags hsc_env - platform = targetPlatform dflags - mbpprTrace x y z = if dopt Opt_D_dump_cmmz dflags then pprTrace x y z else z - dump f = dumpWith ppr f - dumpWith pprFun f txt g = do - -- ToDo: No easy way of say "dump all the cmmz, *and* split - -- them into files." Also, -ddump-cmmz doesn't play nicely - -- with -ddump-to-file, since the headers get omitted. 
- dumpIfSet_dyn dflags f txt (pprFun g) - when (not (dopt f dflags)) $ - dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (pprFun g) - -- Runs a required transformation/analysis - run = runInfiniteFuelIO (hsc_OptFuel hsc_env) - -- Runs an optional transformation/analysis (and should - -- thus be subject to optimization fuel) - runOptimization = runFuelIO (hsc_OptFuel hsc_env) + mbpprTrace x y z | dopt Opt_D_dump_cmmz dflags = pprTrace x y z + | otherwise = z + dump = dumpGraph dflags + + dumps flag name + = mapM_ (dumpWith dflags flag name) + +runUniqSM :: UniqSM a -> IO a +runUniqSM m = do + us <- mkSplitUniqSupply 'u' + return (initUs_ us m) + + +dumpGraph :: DynFlags -> DynFlag -> String -> CmmGraph -> IO () +dumpGraph dflags flag name g = do + when (dopt Opt_DoCmmLinting dflags) $ do_lint g + dumpWith dflags flag name g + where + do_lint g = case cmmLintGraph g of + Just err -> do { fatalErrorMsg dflags err + ; ghcExit dflags 1 + } + Nothing -> return () + +dumpWith :: Outputable a => DynFlags -> DynFlag -> String -> a -> IO () +dumpWith dflags flag txt g = do + -- ToDo: No easy way of say "dump all the cmmz, *and* split + -- them into files." Also, -ddump-cmmz doesn't play nicely + -- with -ddump-to-file, since the headers get omitted. + dumpIfSet_dyn dflags flag txt (ppr g) + when (not (dopt flag dflags)) $ + dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (ppr g) -- This probably belongs in CmmBuildInfoTables? -- We're just finishing the job here: once we know what CAFs are defined -- in non-static closures, we can build the SRTs. 
-toTops :: HscEnv -> Map CLabel CAFSet -> (TopSRT, [[CmmDecl]]) - -> [(CAFSet, CmmDecl)] -> IO (TopSRT, [[CmmDecl]]) -toTops hsc_env topCAFEnv (topSRT, tops) gs = +toTops :: Map CLabel CAFSet -> (TopSRT, [[CmmDecl]]) + -> [(CAFSet, CmmDecl)] -> IO (TopSRT, [[CmmDecl]]) +toTops topCAFEnv (topSRT, tops) gs = do let setSRT (topSRT, rst) g = do (topSRT, gs) <- setInfoTableSRT topCAFEnv topSRT g return (topSRT, gs : rst) - (topSRT, gs') <- runFuelIO (hsc_OptFuel hsc_env) $ foldM setSRT (topSRT, []) gs + (topSRT, gs') <- runUniqSM $ foldM setSRT (topSRT, []) gs return (topSRT, concat gs' : tops) diff --git a/compiler/cmm/CmmProcPoint.hs b/compiler/cmm/CmmProcPoint.hs index f50d850b3a..6eb92666af 100644 --- a/compiler/cmm/CmmProcPoint.hs +++ b/compiler/cmm/CmmProcPoint.hs @@ -4,7 +4,7 @@ module CmmProcPoint ( ProcPointSet, Status(..) , callProcPoints, minimalProcPointSet - , addProcPointProtocols, splitAtProcPoints, procPointAnalysis + , splitAtProcPoints, procPointAnalysis ) where @@ -13,22 +13,17 @@ import Prelude hiding (last, unzip, succ, zip) import BlockId import CLabel import Cmm +import PprCmm () import CmmUtils -import CmmContFlowOpt import CmmInfo -import CmmLive -import Constants import Data.List (sortBy) import Maybes -import MkGraph import Control.Monad -import OptimizationFuel import Outputable import Platform -import UniqSet import UniqSupply -import Compiler.Hoopl +import Hoopl import qualified Data.Map as Map @@ -103,34 +98,50 @@ instance Outputable Status where (hsep $ punctuate comma $ map ppr $ setElems ps) ppr ProcPoint = text "<procpt>" -lattice :: DataflowLattice Status -lattice = DataflowLattice "direct proc-point reachability" unreached add_to - where unreached = ReachedBy setEmpty - add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint) - add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint) -- because of previous case - add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p')) = - let union = setUnion p' p - in if setSize union > setSize p 
then (SomeChange, ReachedBy union) - else (NoChange, ReachedBy p) -------------------------------------------------- +-- Proc point analysis + +procPointAnalysis :: ProcPointSet -> CmmGraph -> UniqSM (BlockEnv Status) +-- Once you know what the proc-points are, figure out +-- what proc-points each block is reachable from +procPointAnalysis procPoints g = + -- pprTrace "procPointAnalysis" (ppr procPoints) $ + dataflowAnalFwdBlocks g initProcPoints $ analFwd lattice forward + where initProcPoints = [(id, ProcPoint) | id <- setElems procPoints] + -- transfer equations forward :: FwdTransfer CmmNode Status -forward = mkFTransfer3 first middle ((mkFactBase lattice . ) . last) - where first :: CmmNode C O -> Status -> Status - first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id - first _ x = x +forward = mkFTransfer3 first middle last + where + first :: CmmNode C O -> Status -> Status + first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id + first _ x = x - middle _ x = x + middle _ x = x - last :: CmmNode O C -> Status -> [(Label, Status)] - last (CmmCall {cml_cont = Just k}) _ = [(k, ProcPoint)] - last (CmmForeignCall {succ = k}) _ = [(k, ProcPoint)] - last l x = map (\id -> (id, x)) (successors l) + last :: CmmNode O C -> Status -> FactBase Status + last l x = mkFactBase lattice $ map (\id -> (id, x)) (successors l) --- It is worth distinguishing two sets of proc points: --- those that are induced by calls in the original graph --- and those that are introduced because they're reachable from multiple proc points. 
+lattice :: DataflowLattice Status +lattice = DataflowLattice "direct proc-point reachability" unreached add_to + where unreached = ReachedBy setEmpty + add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint) + add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint) + -- because of previous case + add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p')) + | setSize union > setSize p = (SomeChange, ReachedBy union) + | otherwise = (NoChange, ReachedBy p) + where + union = setUnion p' p + +---------------------------------------------------------------------- + +-- It is worth distinguishing two sets of proc points: those that are +-- induced by calls in the original graph and those that are +-- introduced because they're reachable from multiple proc points. +-- +-- Extract the set of Continuation BlockIds, see Note [Continuation BlockIds]. callProcPoints :: CmmGraph -> ProcPointSet callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g where add :: CmmBlock -> BlockSet -> BlockSet @@ -139,21 +150,17 @@ callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g CmmForeignCall {succ=k} -> setInsert k set _ -> set -minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph -> FuelUniqSM ProcPointSet +minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph + -> UniqSM ProcPointSet -- Given the set of successors of calls (which must be proc-points) -- figure out the minimal set of necessary proc-points -minimalProcPointSet platform callProcPoints g = extendPPSet platform g (postorderDfs g) callProcPoints +minimalProcPointSet platform callProcPoints g + = extendPPSet platform g (postorderDfs g) callProcPoints -procPointAnalysis :: ProcPointSet -> CmmGraph -> FuelUniqSM (BlockEnv Status) --- Once you know what the proc-points are, figure out --- what proc-points each block is reachable from -procPointAnalysis procPoints g = - liftM snd $ dataflowPassFwd g initProcPoints $ analFwd lattice forward - where initProcPoints = [(id, ProcPoint) | id <- 
setElems procPoints] - -extendPPSet :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> FuelUniqSM ProcPointSet +extendPPSet :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> UniqSM ProcPointSet extendPPSet platform g blocks procPoints = do env <- procPointAnalysis procPoints g + -- pprTrace "extensPPSet" (ppr env) $ return () let add block pps = let id = entryLabel block in case mapLookup id env of Just ProcPoint -> setInsert id pps @@ -179,183 +186,13 @@ extendPPSet platform g blocks procPoints = pps -> extendPPSet g blocks (foldl extendBlockSet procPoints' pps) -} - case newPoint of Just id -> - if setMember id procPoints' then panic "added old proc pt" - else extendPPSet platform g blocks (setInsert id procPoints') - Nothing -> return procPoints' - - ------------------------------------------------------------------------- --- Computing Proc-Point Protocols -- ------------------------------------------------------------------------- - -{- - -There is one major trick, discovered by Michael Adams, which is that -we want to choose protocols in a way that enables us to optimize away -some continuations. The optimization is very much like branch-chain -elimination, except that it involves passing results as well as -control. The idea is that if a call's continuation k does nothing but -CopyIn its results and then goto proc point P, the call's continuation -may be changed to P, *provided* P's protocol is identical to the -protocol for the CopyIn. We choose protocols to make this so. 
- -Here's an explanatory example; we begin with the source code (lines -separate basic blocks): - - ..1..; - x, y = g(); - goto P; - ------- - P: ..2..; - -Zipperization converts this code as follows: - - ..1..; - call g() returns to k; - ------- - k: CopyIn(x, y); - goto P; - ------- - P: ..2..; - -What we'd like to do is assign P the same CopyIn protocol as k, so we -can eliminate k: - - ..1..; - call g() returns to P; - ------- - P: CopyIn(x, y); ..2..; - -Of course, P may be the target of more than one continuation, and -different continuations may have different protocols. Michael Adams -implemented a voting mechanism, but he thinks a simple greedy -algorithm would be just as good, so that's what we do. - --} + case newPoint of + Just id -> + if setMember id procPoints' + then panic "added old proc pt" + else extendPPSet platform g blocks (setInsert id procPoints') + Nothing -> return procPoints' -data Protocol = Protocol Convention [CmmFormal] Area - deriving Eq -instance Outputable Protocol where - ppr (Protocol c fs a) = text "Protocol" <+> ppr c <+> ppr fs <+> ppr a - --- | Function 'optimize_calls' chooses protocols only for those proc --- points that are relevant to the optimization explained above. --- The others are assigned by 'add_unassigned', which is not yet clever. 
- -addProcPointProtocols :: ProcPointSet -> ProcPointSet -> CmmGraph -> FuelUniqSM CmmGraph -addProcPointProtocols callPPs procPoints g = - do liveness <- cmmLiveness g - (protos, g') <- optimize_calls liveness g - blocks'' <- add_CopyOuts protos procPoints g' - return $ ofBlockMap (g_entry g) blocks'' - where optimize_calls liveness g = -- see Note [Separate Adams optimization] - do let (protos, blocks') = - foldGraphBlocks maybe_add_call (mapEmpty, mapEmpty) g - protos' = add_unassigned liveness procPoints protos - let g' = ofBlockMap (g_entry g) (add_CopyIns callPPs protos' blocks') - return (protos', removeUnreachableBlocks g') - maybe_add_call :: CmmBlock -> (BlockEnv Protocol, BlockEnv CmmBlock) - -> (BlockEnv Protocol, BlockEnv CmmBlock) - -- ^ If the block is a call whose continuation goes to a proc point - -- whose protocol either matches the continuation's or is not yet set, - -- redirect the call (cf 'newblock') and set the protocol if necessary - maybe_add_call block (protos, blocks) = - case lastNode block of - CmmCall tgt (Just k) args res s - | Just proto <- mapLookup k protos, - Just pee <- branchesToProcPoint k - -> let newblock = replaceLastNode block (CmmCall tgt (Just pee) - args res s) - changed_blocks = insertBlock newblock blocks - unchanged_blocks = insertBlock block blocks - in case mapLookup pee protos of - Nothing -> (mapInsert pee proto protos, changed_blocks) - Just proto' -> - if proto == proto' then (protos, changed_blocks) - else (protos, unchanged_blocks) - _ -> (protos, insertBlock block blocks) - - branchesToProcPoint :: BlockId -> Maybe BlockId - -- ^ Tells whether the named block is just a branch to a proc point - branchesToProcPoint id = - let block = mapLookup id (toBlockMap g) `orElse` - panic "branch out of graph" - in case blockToNodeList block of - (_, [], JustC (CmmBranch pee)) | setMember pee procPoints -> Just pee - _ -> Nothing - --- | For now, following a suggestion by Ben Lippmeier, we pass all --- live variables as 
arguments, hoping that a clever register --- allocator might help. - -add_unassigned :: BlockEnv CmmLive -> ProcPointSet -> BlockEnv Protocol -> - BlockEnv Protocol -add_unassigned = pass_live_vars_as_args - -pass_live_vars_as_args :: BlockEnv CmmLive -> ProcPointSet -> - BlockEnv Protocol -> BlockEnv Protocol -pass_live_vars_as_args _liveness procPoints protos = protos' - where protos' = setFold addLiveVars protos procPoints - addLiveVars :: BlockId -> BlockEnv Protocol -> BlockEnv Protocol - addLiveVars id protos = - case mapLookup id protos of - Just _ -> protos - Nothing -> let live = emptyRegSet - --lookupBlockEnv _liveness id `orElse` - --panic ("no liveness at block " ++ show id) - formals = uniqSetToList live - prot = Protocol Private formals $ CallArea $ Young id - in mapInsert id prot protos - - --- | Add copy-in instructions to each proc point that did not arise from a call --- instruction. (Proc-points that arise from calls already have their copy-in instructions.) - -add_CopyIns :: ProcPointSet -> BlockEnv Protocol -> BlockEnv CmmBlock -> BlockEnv CmmBlock -add_CopyIns callPPs protos blocks = mapFold maybe_insert_CopyIns mapEmpty blocks - where maybe_insert_CopyIns block blocks - | not $ setMember bid callPPs - , Just (Protocol c fs _area) <- mapLookup bid protos - = let nodes = copyInSlot c fs - (h, m, l) = blockToNodeList block - in insertBlock (blockOfNodeList (h, nodes ++ m, l)) blocks - | otherwise = insertBlock block blocks - where bid = entryLabel block - - --- | Add a CopyOut node before each procpoint. --- If the predecessor is a call, then the copy outs should already be done by the callee. --- Note: If we need to add copy-out instructions, they may require stack space, --- so we accumulate a map from the successors to the necessary stack space, --- then update the successors after we have finished inserting the copy-outs. 
- -add_CopyOuts :: BlockEnv Protocol -> ProcPointSet -> CmmGraph -> - FuelUniqSM (BlockEnv CmmBlock) -add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty) g - where mb_copy_out :: CmmBlock -> FuelUniqSM (BlockEnv CmmBlock) -> - FuelUniqSM (BlockEnv CmmBlock) - mb_copy_out b z | entryLabel b == g_entry g = skip b z - mb_copy_out b z = - case lastNode b of - CmmCall {} -> skip b z -- copy out done by callee - CmmForeignCall {} -> skip b z -- copy out done by callee - _ -> copy_out b z - copy_out b z = foldr trySucc init (successors b) >>= finish - where init = (\bmap -> (b, bmap)) `liftM` z - trySucc succId z = - if setMember succId procPoints then - case mapLookup succId protos of - Nothing -> z - Just (Protocol c fs _area) -> insert z succId $ copyOutSlot c fs - else z - insert z succId m = - do (b, bmap) <- z - (b, bs) <- insertBetween b m succId - -- pprTrace "insert for succ" (ppr succId <> ppr m) $ do - return $ (b, foldl (flip insertBlock) bmap bs) - finish (b, bmap) = return $ insertBlock b bmap - skip b bs = insertBlock b `liftM` bs -- At this point, we have found a set of procpoints, each of which should be -- the entry point of a procedure. @@ -370,10 +207,9 @@ add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty) -- ToDo: use the _ret naming convention that the old code generator -- used. 
-- EZY splitAtProcPoints :: CLabel -> ProcPointSet-> ProcPointSet -> BlockEnv Status -> - CmmDecl -> FuelUniqSM [CmmDecl] + CmmDecl -> UniqSM [CmmDecl] splitAtProcPoints entry_label callPPs procPoints procMap - (CmmProc (TopInfo {info_tbl=info_tbl, - stack_info=stack_info}) + (CmmProc (TopInfo {info_tbl=info_tbl}) top_l g@(CmmGraph {g_entry=entry})) = do -- Build a map from procpoints to the blocks they reach let addBlock b graphEnv = @@ -384,15 +220,18 @@ splitAtProcPoints entry_label callPPs procPoints procMap [] -> graphEnv [id] -> add graphEnv id bid b _ -> panic "Each block should be reachable from only one ProcPoint" - Nothing -> pprPanic "block not reached by a proc point?" (ppr bid) + Nothing -> graphEnv where bid = entryLabel b add graphEnv procId bid b = mapInsert procId graph' graphEnv where graph = mapLookup procId graphEnv `orElse` mapEmpty graph' = mapInsert bid b graph + graphEnv <- return $ foldGraphBlocks addBlock emptyBlockMap g + -- Build a map from proc point BlockId to pairs of: -- * Labels for their new procedures - -- * Labels for the info tables of their new procedures (only if the proc point is a callPP) + -- * Labels for the info tables of their new procedures (only if + -- the proc point is a callPP) -- Due to common blockification, we may overestimate the set of procpoints. let add_label map pp = Map.insert pp lbls map where lbls | pp == entry = (entry_label, Just entry_info_lbl) @@ -401,30 +240,15 @@ splitAtProcPoints entry_label callPPs procPoints procMap entry_info_lbl = cit_lbl info_tbl procLabels = foldl add_label Map.empty (filter (flip mapMember (toBlockMap g)) (setElems procPoints)) - -- For each procpoint, we need to know the SP offset on entry. 
- -- If the procpoint is: - -- - continuation of a call, the SP offset is in the call - -- - otherwise, 0 (and left out of the spEntryMap) - let add_sp_off :: CmmBlock -> BlockEnv CmmStackInfo -> BlockEnv CmmStackInfo - add_sp_off b env = - case lastNode b of - CmmCall {cml_cont = Just succ, cml_ret_args = off, cml_ret_off = updfr_off} -> - mapInsert succ (StackInfo { arg_space = off, updfr_space = Just updfr_off}) env - CmmForeignCall {succ = succ, updfr = updfr_off} -> - mapInsert succ (StackInfo { arg_space = wORD_SIZE, updfr_space = Just updfr_off}) env - _ -> env - spEntryMap = foldGraphBlocks add_sp_off (mapInsert entry stack_info emptyBlockMap) g - getStackInfo id = mapLookup id spEntryMap `orElse` StackInfo {arg_space = 0, updfr_space = Nothing} -- In each new graph, add blocks jumping off to the new procedures, -- and replace branches to procpoints with branches to the jump-off blocks let add_jump_block (env, bs) (pp, l) = do bid <- liftM mkBlockId getUniqueM - let b = blockOfNodeList (JustC (CmmEntry bid), [], JustC jump) - StackInfo {arg_space = argSpace, updfr_space = off} = getStackInfo pp - jump = CmmCall (CmmLit (CmmLabel l)) Nothing argSpace 0 - (off `orElse` 0) -- Jump's shouldn't need the offset... 
+ let b = blockJoin (CmmEntry bid) emptyBlock jump + jump = CmmCall (CmmLit (CmmLabel l)) Nothing 0 0 0 return (mapInsert pp bid env, b : bs) - add_jumps (newGraphEnv) (ppId, blockEnv) = + + add_jumps newGraphEnv (ppId, blockEnv) = do let needed_jumps = -- find which procpoints we currently branch to mapFold add_if_branch_to_pp [] blockEnv add_if_branch_to_pp :: CmmBlock -> [(BlockId, CLabel)] -> [(BlockId, CLabel)] @@ -441,17 +265,16 @@ splitAtProcPoints entry_label callPPs procPoints procMap foldM add_jump_block (mapEmpty, []) needed_jumps -- update the entry block let b = expectJust "block in env" $ mapLookup ppId blockEnv - off = getStackInfo ppId blockEnv' = mapInsert ppId b blockEnv -- replace branches to procpoints with branches to jumps blockEnv'' = toBlockMap $ replaceBranches jumpEnv $ ofBlockMap ppId blockEnv' -- add the jump blocks to the graph blockEnv''' = foldl (flip insertBlock) blockEnv'' jumpBlocks - let g' = (off, ofBlockMap ppId blockEnv''') + let g' = ofBlockMap ppId blockEnv''' -- pprTrace "g' pre jumps" (ppr g') $ do return (mapInsert ppId g' newGraphEnv) graphEnv <- foldM add_jumps emptyBlockMap $ mapToList graphEnv - let to_proc (bid, (stack_info, g)) = case expectJust "pp label" $ Map.lookup bid procLabels of + let to_proc (bid, g) = case expectJust "pp label" $ Map.lookup bid procLabels of (lbl, Just info_lbl) | bid == entry -> CmmProc (TopInfo {info_tbl=info_tbl, stack_info=stack_info}) @@ -462,15 +285,22 @@ splitAtProcPoints entry_label callPPs procPoints procMap (lbl, Nothing) -> CmmProc (TopInfo {info_tbl=CmmNonInfoTable, stack_info=stack_info}) lbl (replacePPIds g) - -- References to procpoint IDs can now be replaced with the infotable's label - replacePPIds g = mapGraphNodes (id, mapExp repl, mapExp repl) g + where + stack_info = panic "No StackInfo" + + -- References to procpoint IDs can now be replaced with the + -- infotable's label + replacePPIds g = {-# SCC "replacePPIds" #-} + mapGraphNodes (id, mapExp repl, mapExp repl) g 
where repl e@(CmmLit (CmmBlock bid)) = case Map.lookup bid procLabels of Just (_, Just info_lbl) -> CmmLit (CmmLabel info_lbl) _ -> e repl e = e - -- The C back end expects to see return continuations before the call sites. - -- Here, we sort them in reverse order -- it gets reversed later. + + -- The C back end expects to see return continuations before the + -- call sites. Here, we sort them in reverse order -- it gets + -- reversed later. let (_, block_order) = foldl add_block_num (0::Int, emptyBlockMap) (postorderDfs g) add_block_num (i, map) block = (i+1, mapInsert (entryLabel block) i map) sort_fn (bid, _) (bid', _) = @@ -482,6 +312,27 @@ splitAtProcPoints entry_label callPPs procPoints procMap procs splitAtProcPoints _ _ _ _ t@(CmmData _ _) = return [t] + +-- Only called from CmmProcPoint.splitAtProcPoints. NB. does a +-- recursive lookup, see comment below. +replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph +replaceBranches env cmmg + = {-# SCC "replaceBranches" #-} + ofBlockMap (g_entry cmmg) $ mapMap f $ toBlockMap cmmg + where + f block = replaceLastNode block $ last (lastNode block) + + last :: CmmNode O C -> CmmNode O C + last (CmmBranch id) = CmmBranch (lookup id) + last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi) + last (CmmSwitch e tbl) = CmmSwitch e (map (fmap lookup) tbl) + last l@(CmmCall {}) = l + last l@(CmmForeignCall {}) = l + lookup id = fmap lookup (mapLookup id env) `orElse` id + -- XXX: this is a recursive lookup, it follows chains + -- until the lookup returns Nothing, at which point we + -- return the last BlockId + ---------------------------------------------------------------- {- diff --git a/compiler/cmm/CmmRewriteAssignments.hs b/compiler/cmm/CmmRewriteAssignments.hs index ecf3f7e0c3..cf349a0334 100644 --- a/compiler/cmm/CmmRewriteAssignments.hs +++ b/compiler/cmm/CmmRewriteAssignments.hs @@ -18,23 +18,23 @@ module CmmRewriteAssignments import Cmm import CmmUtils import CmmOpt -import 
OptimizationFuel import StgCmmUtils -import Control.Monad +import UniqSupply import Platform import UniqFM import Unique import BlockId -import Compiler.Hoopl hiding (Unique) +import Hoopl import Data.Maybe +import Control.Monad import Prelude hiding (succ, zip) ---------------------------------------------------------------- --- Main function -rewriteAssignments :: Platform -> CmmGraph -> FuelUniqSM CmmGraph +rewriteAssignments :: Platform -> CmmGraph -> UniqSM CmmGraph rewriteAssignments platform g = do -- Because we need to act on forwards and backwards information, we -- first perform usage analysis and bake this information into the @@ -213,7 +213,7 @@ usageTransfer = mkBTransfer3 first middle last increaseUsage f r = addToUFM_C combine f r SingleUse where combine _ _ = ManyUse -usageRewrite :: BwdRewrite FuelUniqSM (WithRegUsage CmmNode) UsageMap +usageRewrite :: BwdRewrite UniqSM (WithRegUsage CmmNode) UsageMap usageRewrite = mkBRewrite3 first middle last where first _ _ = return Nothing middle :: Monad m => WithRegUsage CmmNode O O -> UsageMap -> m (Maybe (Graph (WithRegUsage CmmNode) O O)) @@ -226,7 +226,7 @@ usageRewrite = mkBRewrite3 first middle last last _ _ = return Nothing type CmmGraphWithRegUsage = GenCmmGraph (WithRegUsage CmmNode) -annotateUsage :: CmmGraph -> FuelUniqSM (CmmGraphWithRegUsage) +annotateUsage :: CmmGraph -> UniqSM (CmmGraphWithRegUsage) annotateUsage vanilla_g = let g = modifyGraph liftRegUsage vanilla_g in liftM fst $ dataflowPassBwd g [(g_entry g, fact_bot usageLattice)] $ @@ -404,8 +404,8 @@ clobbers (CmmReg (CmmGlobal Hp), _) (_, _) = False -- ToDo: Also catch MachOp case clobbers (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _) | getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?) 
-clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr - where f (CmmLoad (CmmStackSlot (CallArea a') o') t) +clobbers (CmmStackSlot a o, rhs) (_, expr) = f expr + where f (CmmLoad (CmmStackSlot a' o') t) = (a, o, widthInBytes (cmmExprWidth rhs)) `overlaps` (a', o', widthInBytes (typeWidth t)) f (CmmLoad e _) = containsStackSlot e f (CmmMachOp _ es) = or (map f es) @@ -416,9 +416,6 @@ clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es) containsStackSlot (CmmStackSlot{}) = True containsStackSlot _ = False -clobbers (CmmStackSlot (RegSlot l) _, _) (_, expr) = f expr - where f (CmmLoad (CmmStackSlot (RegSlot l') _) _) = l == l' - f _ = False clobbers _ (_, e) = f e where f (CmmLoad (CmmStackSlot _ _) _) = False f (CmmLoad{}) = True -- conservative @@ -432,7 +429,7 @@ clobbers _ (_, e) = f e -- [ I32 ] -- [ F64 ] -- s' -w'- o' -type CallSubArea = (AreaId, Int, Int) -- area, offset, width +type CallSubArea = (Area, Int, Int) -- area, offset, width overlaps :: CallSubArea -> CallSubArea -> Bool overlaps (a, _, _) (a', _, _) | a /= a' = False overlaps (_, o, w) (_, o', w') = @@ -457,7 +454,7 @@ invalidateVolatile :: BlockId -> AssignmentMap -> AssignmentMap invalidateVolatile k m = mapUFM p m where p (AlwaysInline e) = if exp e then AlwaysInline e else NeverOptimize where exp CmmLit{} = True - exp (CmmLoad (CmmStackSlot (CallArea (Young k')) _) _) + exp (CmmLoad (CmmStackSlot (Young k') _) _) | k' == k = False exp (CmmLoad (CmmStackSlot _ _) _) = True exp (CmmMachOp _ es) = and (map exp es) @@ -527,7 +524,7 @@ assignmentTransfer = mkFTransfer3 (flip const) middleAssignment ((mkFactBase ass -- values from the assignment map, due to reassignment of the local -- register.) This is probably not locally sound. 
-assignmentRewrite :: FwdRewrite FuelUniqSM (WithRegUsage CmmNode) AssignmentMap +assignmentRewrite :: FwdRewrite UniqSM (WithRegUsage CmmNode) AssignmentMap assignmentRewrite = mkFRewrite3 first middle last where first _ _ = return Nothing @@ -596,10 +593,6 @@ assignmentRewrite = mkFRewrite3 first middle last where rep = typeWidth (localRegType r) _ -> old -- See Note [Soundness of store rewriting] - inlineExp assign old@(CmmLoad (CmmStackSlot (RegSlot r) _) _) - = case lookupUFM assign r of - Just (AlwaysInline x) -> x - _ -> old inlineExp _ old = old inlinable :: CmmNode e x -> Bool @@ -612,7 +605,7 @@ assignmentRewrite = mkFRewrite3 first middle last -- in literals, which we can inline more aggressively, and inlining -- gives us opportunities for more folding. However, we don't need any -- facts to do MachOp folding. -machOpFoldRewrite :: Platform -> FwdRewrite FuelUniqSM (WithRegUsage CmmNode) a +machOpFoldRewrite :: Platform -> FwdRewrite UniqSM (WithRegUsage CmmNode) a machOpFoldRewrite platform = mkFRewrite3 first middle last where first _ _ = return Nothing middle :: WithRegUsage CmmNode O O -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O O diff --git a/compiler/cmm/CmmSpillReload.hs b/compiler/cmm/CmmSpillReload.hs deleted file mode 100644 index 9e762fe48a..0000000000 --- a/compiler/cmm/CmmSpillReload.hs +++ /dev/null @@ -1,166 +0,0 @@ -{-# LANGUAGE GADTs, NoMonoLocalBinds, FlexibleContexts #-} --- Norman likes local bindings --- If this module lives on I'd like to get rid of this flag in due course - -{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-} - --- TODO: Get rid of this flag: -{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-} - -module CmmSpillReload - ( dualLivenessWithInsertion - ) -where - -import BlockId -import Cmm -import CmmUtils -import CmmLive -import OptimizationFuel - -import Control.Monad -import Outputable hiding (empty) -import qualified Outputable as PP -import UniqSet - -import Compiler.Hoopl hiding (Unique) -import Data.Maybe 
-import Prelude hiding (succ, zip) - -{- Note [Overview of spill/reload] -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The point of this module is to insert spills and reloads to establish -the invariant that at a call or any proc point with an established -protocol all live variables not expected in registers are sitting on the -stack. We use a backward dual liveness analysis (both traditional -register liveness as well as register slot liveness on the stack) to -insert spills and reloads. It should be followed by a forward -transformation to sink reloads as deeply as possible, so as to reduce -register pressure: this transformation is performed by -CmmRewriteAssignments. - -A variable can be expected to be live in a register, live on the -stack, or both. This analysis ensures that spills and reloads are -inserted as needed to make sure that every live variable needed -after a call is available on the stack. Spills are placed immediately -after their reaching definitions, but reloads are placed immediately -after a return from a call (the entry point.) - -Note that we offer no guarantees about the consistency of the value -in memory and the value in the register, except that they are -equal across calls/procpoints. If the variable is changed, this -mapping breaks: but as the original value of the register may still -be useful in a different context, the memory location is not updated. 
--} - -data DualLive = DualLive { on_stack :: RegSet, in_regs :: RegSet } - -changeStack, changeRegs :: (RegSet -> RegSet) -> DualLive -> DualLive -changeStack f live = live { on_stack = f (on_stack live) } -changeRegs f live = live { in_regs = f (in_regs live) } - -dualLiveLattice :: DataflowLattice DualLive -dualLiveLattice = DataflowLattice "variables live in registers and on stack" empty add - where empty = DualLive emptyRegSet emptyRegSet - add _ (OldFact old) (NewFact new) = (changeIf $ change1 || change2, DualLive stack regs) - where (change1, stack) = add1 (on_stack old) (on_stack new) - (change2, regs) = add1 (in_regs old) (in_regs new) - add1 old new = if sizeUniqSet join > sizeUniqSet old then (True, join) else (False, old) - where join = unionUniqSets old new - -dualLivenessWithInsertion :: BlockSet -> CmmGraph -> FuelUniqSM CmmGraph -dualLivenessWithInsertion procPoints g = - liftM fst $ dataflowPassBwd g [] $ analRewBwd dualLiveLattice - (dualLiveTransfers (g_entry g) procPoints) - (insertSpillsAndReloads g procPoints) - --- Note [Live registers on entry to procpoints] --- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- Remember that the transfer function is only ever run on the rewritten --- version of a graph, and the rewrite function for spills and reloads --- enforces the invariant that no local registers are live on entry to --- a procpoint. Accordingly, we check for this invariant here. An old --- version of this code incorrectly claimed that any live registers were --- live on the stack before entering the function: this is wrong, but --- didn't cause bugs because it never actually was invoked. 
- -dualLiveTransfers :: BlockId -> BlockSet -> (BwdTransfer CmmNode DualLive) -dualLiveTransfers entry procPoints = mkBTransfer3 first middle last - where first :: CmmNode C O -> DualLive -> DualLive - first (CmmEntry id) live -- See Note [Live registers on entry to procpoints] - | id == entry || setMember id procPoints = noLiveOnEntry id (in_regs live) live - | otherwise = live - - middle :: CmmNode O O -> DualLive -> DualLive - middle m = changeStack updSlots - . changeRegs updRegs - where -- Reuse middle of liveness analysis from CmmLive - updRegs = case getBTransfer3 xferLive of (_, middle, _) -> middle m - - updSlots live = foldSlotsUsed reload (foldSlotsDefd spill live m) m - spill live s@(RegSlot r, _, _) = check s $ deleteFromRegSet live r - spill live _ = live - reload live s@(RegSlot r, _, _) = check s $ extendRegSet live r - reload live _ = live - -- Ensure the assignment refers to the entirety of the - -- register slot (and not just a slice). - check (RegSlot (LocalReg _ ty), o, w) x - | o == w && w == widthInBytes (typeWidth ty) = x - check _ _ = panic "dualLiveTransfers: slices unsupported" - - -- Register analysis is identical to liveness analysis from CmmLive. - last :: CmmNode O C -> FactBase DualLive -> DualLive - last l fb = changeRegs (gen_kill l) $ case l of - CmmCall {cml_cont=Nothing} -> empty - CmmCall {cml_cont=Just k} -> keep_stack_only k - CmmForeignCall {succ=k} -> keep_stack_only k - _ -> joinOutFacts dualLiveLattice l fb - where empty = fact_bot dualLiveLattice - lkp k = fromMaybe empty (lookupFact k fb) - keep_stack_only k = DualLive (on_stack (lkp k)) emptyRegSet - -insertSpillsAndReloads :: CmmGraph -> BlockSet -> CmmBwdRewrite DualLive -insertSpillsAndReloads graph procPoints = deepBwdRw3 first middle nothing - -- Beware: deepBwdRw with one polymorphic function seems more reasonable here, - -- but GHC miscompiles it, see bug #4044. 
- where first :: CmmNode C O -> Fact O DualLive -> CmmReplGraph C O - first e@(CmmEntry id) live = return $ - if id /= (g_entry graph) && setMember id procPoints then - case map reload (uniqSetToList (in_regs live)) of - [] -> Nothing - is -> Just $ mkFirst e <*> mkMiddles is - else Nothing - -- EZY: There was some dead code for handling the case where - -- we were not splitting procedures. Check Git history if - -- you're interested (circa e26ea0f41). - - middle :: CmmNode O O -> Fact O DualLive -> CmmReplGraph O O - -- Don't add spills next to reloads. - middle (CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg') _) _)) _ | reg == reg' = return Nothing - -- Spill if register is live on stack. - middle m@(CmmAssign (CmmLocal reg) _) live - | reg `elemRegSet` on_stack live = return (Just (mkMiddles [m, spill reg])) - middle _ _ = return Nothing - - nothing _ _ = return Nothing - -spill, reload :: LocalReg -> CmmNode O O -spill r = CmmStore (regSlot r) (CmmReg $ CmmLocal r) -reload r = CmmAssign (CmmLocal r) (CmmLoad (regSlot r) $ localRegType r) - ---------------------- --- prettyprinting - -ppr_regs :: String -> RegSet -> SDoc -ppr_regs s regs = text s <+> commafy (map ppr $ uniqSetToList regs) - where commafy xs = hsep $ punctuate comma xs - -instance Outputable DualLive where - ppr (DualLive {in_regs = regs, on_stack = stack}) = - if isEmptyUniqSet regs && isEmptyUniqSet stack then - text "<nothing-live>" - else - nest 2 $ fsep [if isEmptyUniqSet regs then PP.empty - else (ppr_regs "live in regs =" regs), - if isEmptyUniqSet stack then PP.empty - else (ppr_regs "live on stack =" stack)] diff --git a/compiler/cmm/CmmStackLayout.hs b/compiler/cmm/CmmStackLayout.hs index c7fedad05b..726f98e8a3 100644 --- a/compiler/cmm/CmmStackLayout.hs +++ b/compiler/cmm/CmmStackLayout.hs @@ -35,7 +35,6 @@ import CmmProcPoint import Maybes import MkGraph (stackStubExpr) import Control.Monad -import OptimizationFuel import Outputable import SMRep (ByteOff) diff --git 
a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs index 0756c87583..f2e4d8e183 100644 --- a/compiler/cmm/CmmUtils.hs +++ b/compiler/cmm/CmmUtils.hs @@ -60,13 +60,14 @@ module CmmUtils( -- * Operations that probably don't belong here modifyGraph, - lastNode, replaceLastNode, insertBetween, + lastNode, replaceLastNode, ofBlockMap, toBlockMap, insertBlock, ofBlockList, toBlockList, bodyToBlockList, foldGraphBlocks, mapGraphNodes, postorderDfs, mapGraphNodes1, analFwd, analBwd, analRewFwd, analRewBwd, - dataflowPassFwd, dataflowPassBwd + dataflowPassFwd, dataflowPassBwd, dataflowAnalFwd, dataflowAnalBwd, + dataflowAnalFwdBlocks ) where #include "HsVersions.h" @@ -79,7 +80,6 @@ import Cmm import BlockId import CLabel import Outputable -import OptimizationFuel as F import Unique import UniqSupply import Constants( wORD_SIZE, tAG_MASK ) @@ -88,8 +88,7 @@ import Util import Data.Word import Data.Maybe import Data.Bits -import Control.Monad -import Compiler.Hoopl hiding ( Unique ) +import Hoopl --------------------------------------------------- -- @@ -402,13 +401,13 @@ mkLiveness (reg:regs) modifyGraph :: (Graph n C C -> Graph n' C C) -> GenCmmGraph n -> GenCmmGraph n' modifyGraph f g = CmmGraph {g_entry=g_entry g, g_graph=f (g_graph g)} -toBlockMap :: CmmGraph -> LabelMap CmmBlock +toBlockMap :: CmmGraph -> BlockEnv CmmBlock toBlockMap (CmmGraph {g_graph=GMany NothingO body NothingO}) = body -ofBlockMap :: BlockId -> LabelMap CmmBlock -> CmmGraph +ofBlockMap :: BlockId -> BlockEnv CmmBlock -> CmmGraph ofBlockMap entry bodyMap = CmmGraph {g_entry=entry, g_graph=GMany NothingO bodyMap NothingO} -insertBlock :: CmmBlock -> LabelMap CmmBlock -> LabelMap CmmBlock +insertBlock :: CmmBlock -> BlockEnv CmmBlock -> BlockEnv CmmBlock insertBlock block map = ASSERT (isNothing $ mapLookup id map) mapInsert id block map @@ -418,7 +417,8 @@ toBlockList :: CmmGraph -> [CmmBlock] toBlockList g = mapElems $ toBlockMap g ofBlockList :: BlockId -> [CmmBlock] -> CmmGraph -ofBlockList 
entry blocks = CmmGraph {g_entry=entry, g_graph=GMany NothingO body NothingO} +ofBlockList entry blocks = CmmGraph { g_entry = entry + , g_graph = GMany NothingO body NothingO } where body = foldr addBlock emptyBody blocks bodyToBlockList :: Body CmmNode -> [CmmBlock] @@ -429,97 +429,77 @@ mapGraphNodes :: ( CmmNode C O -> CmmNode C O , CmmNode O C -> CmmNode O C) -> CmmGraph -> CmmGraph mapGraphNodes funs@(mf,_,_) g = - ofBlockMap (entryLabel $ mf $ CmmEntry $ g_entry g) $ mapMap (blockMapNodes3 funs) $ toBlockMap g + ofBlockMap (entryLabel $ mf $ CmmEntry $ g_entry g) $ mapMap (mapBlock3' funs) $ toBlockMap g mapGraphNodes1 :: (forall e x. CmmNode e x -> CmmNode e x) -> CmmGraph -> CmmGraph -mapGraphNodes1 f g = modifyGraph (graphMapBlocks (blockMapNodes f)) g +mapGraphNodes1 f = modifyGraph (mapGraph f) foldGraphBlocks :: (CmmBlock -> a -> a) -> a -> CmmGraph -> a foldGraphBlocks k z g = mapFold k z $ toBlockMap g postorderDfs :: CmmGraph -> [CmmBlock] -postorderDfs g = postorder_dfs_from (toBlockMap g) (g_entry g) - -------------------------------------------------- --- Manipulating CmmBlocks - -lastNode :: CmmBlock -> CmmNode O C -lastNode block = foldBlockNodesF3 (nothing, nothing, const) block () - where nothing :: a -> b -> () - nothing _ _ = () - -replaceLastNode :: Block CmmNode e C -> CmmNode O C -> Block CmmNode e C -replaceLastNode block last = blockOfNodeList (first, middle, JustC last) - where (first, middle, _) = blockToNodeList block - ----------------------------------------------------------------------- ------ Splicing between blocks --- Given a middle node, a block, and a successor BlockId, --- we can insert the middle node between the block and the successor. --- We return the updated block and a list of new blocks that must be added --- to the graph. --- The semantics is a bit tricky. 
We consider cases on the last node: --- o For a branch, we can just insert before the branch, --- but sometimes the optimizer does better if we actually insert --- a fresh basic block, enabling some common blockification. --- o For a conditional branch, switch statement, or call, we must insert --- a new basic block. --- o For a jump or return, this operation is impossible. - -insertBetween :: MonadUnique m => CmmBlock -> [CmmNode O O] -> BlockId -> m (CmmBlock, [CmmBlock]) -insertBetween b ms succId = insert $ lastNode b - where insert :: MonadUnique m => CmmNode O C -> m (CmmBlock, [CmmBlock]) - insert (CmmBranch bid) = - if bid == succId then - do (bid', bs) <- newBlocks - return (replaceLastNode b (CmmBranch bid'), bs) - else panic "tried invalid block insertBetween" - insert (CmmCondBranch c t f) = - do (t', tbs) <- if t == succId then newBlocks else return $ (t, []) - (f', fbs) <- if f == succId then newBlocks else return $ (f, []) - return (replaceLastNode b (CmmCondBranch c t' f'), tbs ++ fbs) - insert (CmmSwitch e ks) = - do (ids, bs) <- mapAndUnzipM mbNewBlocks ks - return (replaceLastNode b (CmmSwitch e ids), join bs) - insert (CmmCall {}) = - panic "unimp: insertBetween after a call -- probably not a good idea" - insert (CmmForeignCall {}) = - panic "unimp: insertBetween after a foreign call -- probably not a good idea" - - newBlocks :: MonadUnique m => m (BlockId, [CmmBlock]) - newBlocks = do id <- liftM mkBlockId $ getUniqueM - return $ (id, [blockOfNodeList (JustC (CmmEntry id), ms, JustC (CmmBranch succId))]) - mbNewBlocks :: MonadUnique m => Maybe BlockId -> m (Maybe BlockId, [CmmBlock]) - mbNewBlocks (Just k) = if k == succId then liftM fstJust newBlocks - else return (Just k, []) - mbNewBlocks Nothing = return (Nothing, []) - fstJust (id, bs) = (Just id, bs) +postorderDfs g = {-# SCC "postorderDfs" #-} postorder_dfs_from (toBlockMap g) (g_entry g) ------------------------------------------------- -- Running dataflow analysis and/or rewrites -- 
Constructing forward and backward analysis-only pass -analFwd :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdPass m n f -analBwd :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdPass m n f +analFwd :: DataflowLattice f -> FwdTransfer n f -> FwdPass UniqSM n f +analBwd :: DataflowLattice f -> BwdTransfer n f -> BwdPass UniqSM n f analFwd lat xfer = analRewFwd lat xfer noFwdRewrite analBwd lat xfer = analRewBwd lat xfer noBwdRewrite -- Constructing forward and backward analysis + rewrite pass -analRewFwd :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdRewrite m n f -> FwdPass m n f -analRewBwd :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdRewrite m n f -> BwdPass m n f +analRewFwd :: DataflowLattice f -> FwdTransfer n f + -> FwdRewrite UniqSM n f + -> FwdPass UniqSM n f + +analRewBwd :: DataflowLattice f + -> BwdTransfer n f + -> BwdRewrite UniqSM n f + -> BwdPass UniqSM n f analRewFwd lat xfer rew = FwdPass {fp_lattice = lat, fp_transfer = xfer, fp_rewrite = rew} analRewBwd lat xfer rew = BwdPass {bp_lattice = lat, bp_transfer = xfer, bp_rewrite = rew} -- Running forward and backward dataflow analysis + optional rewrite -dataflowPassFwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> FwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f) +dataflowPassFwd :: NonLocal n => + GenCmmGraph n -> [(BlockId, f)] + -> FwdPass UniqSM n f + -> UniqSM (GenCmmGraph n, BlockEnv f) dataflowPassFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts) return (CmmGraph {g_entry=entry, g_graph=graph}, facts) -dataflowPassBwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> BwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f) +dataflowAnalFwd :: NonLocal n => + GenCmmGraph n -> [(BlockId, f)] + -> FwdPass UniqSM n f + -> BlockEnv f +dataflowAnalFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = + 
analyzeFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts) + +dataflowAnalFwdBlocks :: NonLocal n => + GenCmmGraph n -> [(BlockId, f)] + -> FwdPass UniqSM n f + -> UniqSM (BlockEnv f) +dataflowAnalFwdBlocks (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do +-- (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts) +-- return facts + return (analyzeFwdBlocks fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)) + +dataflowAnalBwd :: NonLocal n => + GenCmmGraph n -> [(BlockId, f)] + -> BwdPass UniqSM n f + -> BlockEnv f +dataflowAnalBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd = + analyzeBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts) + +dataflowPassBwd :: NonLocal n => + GenCmmGraph n -> [(BlockId, f)] + -> BwdPass UniqSM n f + -> UniqSM (GenCmmGraph n, BlockEnv f) dataflowPassBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd = do (graph, facts, NothingO) <- analyzeAndRewriteBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts) return (CmmGraph {g_entry=entry, g_graph=graph}, facts) diff --git a/compiler/cmm/Hoopl.hs b/compiler/cmm/Hoopl.hs new file mode 100644 index 0000000000..0eca85cb8a --- /dev/null +++ b/compiler/cmm/Hoopl.hs @@ -0,0 +1,125 @@ +module Hoopl ( + module Compiler.Hoopl, + module Hoopl.Dataflow, + deepFwdRw, deepFwdRw3, + deepBwdRw, deepBwdRw3, + thenFwdRw + ) where + +import Compiler.Hoopl hiding + ( Unique, + FwdTransfer(..), FwdRewrite(..), FwdPass(..), + BwdTransfer(..), BwdRewrite(..), BwdPass(..), + noFwdRewrite, noBwdRewrite, + analyzeAndRewriteFwd, analyzeAndRewriteBwd, + mkFactBase, Fact, + mkBRewrite, mkBRewrite3, mkBTransfer, mkBTransfer3, + mkFRewrite, mkFRewrite3, mkFTransfer, mkFTransfer3, + deepFwdRw, deepFwdRw3, thenFwdRw, iterFwdRw, + deepBwdRw, deepBwdRw3, thenBwdRw, iterBwdRw + ) + +import Hoopl.Dataflow +import Control.Monad +import UniqSupply + +deepFwdRw3 :: (n C O -> f -> UniqSM (Maybe (Graph n 
C O))) + -> (n O O -> f -> UniqSM (Maybe (Graph n O O))) + -> (n O C -> f -> UniqSM (Maybe (Graph n O C))) + -> (FwdRewrite UniqSM n f) +deepFwdRw :: (forall e x . n e x -> f -> UniqSM (Maybe (Graph n e x))) -> FwdRewrite UniqSM n f +deepFwdRw3 f m l = iterFwdRw $ mkFRewrite3 f m l +deepFwdRw f = deepFwdRw3 f f f + +-- N.B. rw3, rw3', and rw3a are triples of functions. +-- But rw and rw' are single functions. +thenFwdRw :: forall n f. + FwdRewrite UniqSM n f + -> FwdRewrite UniqSM n f + -> FwdRewrite UniqSM n f +thenFwdRw rw3 rw3' = wrapFR2 thenrw rw3 rw3' + where + thenrw :: forall e x t t1. + (t -> t1 -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))) + -> (t -> t1 -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))) + -> t + -> t1 + -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)) + thenrw rw rw' n f = rw n f >>= fwdRes + where fwdRes Nothing = rw' n f + fwdRes (Just gr) = return $ Just $ fadd_rw rw3' gr + +iterFwdRw :: forall n f. + FwdRewrite UniqSM n f + -> FwdRewrite UniqSM n f +iterFwdRw rw3 = wrapFR iter rw3 + where iter :: forall a e x t. + (t -> a -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))) + -> t + -> a + -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)) + iter rw n = (liftM $ liftM $ fadd_rw (iterFwdRw rw3)) . rw n + +-- | Function inspired by 'rew' in the paper +_frewrite_cps :: ((Graph n e x, FwdRewrite UniqSM n f) -> UniqSM a) + -> UniqSM a + -> (forall e x . 
n e x -> f -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))) + -> n e x + -> f + -> UniqSM a +_frewrite_cps j n rw node f = + do mg <- rw node f + case mg of Nothing -> n + Just gr -> j gr + + + +-- | Function inspired by 'add' in the paper +fadd_rw :: FwdRewrite UniqSM n f + -> (Graph n e x, FwdRewrite UniqSM n f) + -> (Graph n e x, FwdRewrite UniqSM n f) +fadd_rw rw2 (g, rw1) = (g, rw1 `thenFwdRw` rw2) + + + +deepBwdRw3 :: + (n C O -> f -> UniqSM (Maybe (Graph n C O))) + -> (n O O -> f -> UniqSM (Maybe (Graph n O O))) + -> (n O C -> FactBase f -> UniqSM (Maybe (Graph n O C))) + -> (BwdRewrite UniqSM n f) +deepBwdRw :: (forall e x . n e x -> Fact x f -> UniqSM (Maybe (Graph n e x))) + -> BwdRewrite UniqSM n f +deepBwdRw3 f m l = iterBwdRw $ mkBRewrite3 f m l +deepBwdRw f = deepBwdRw3 f f f + + +thenBwdRw :: forall n f. BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f +thenBwdRw rw1 rw2 = wrapBR2 f rw1 rw2 + where f :: forall t t1 t2 e x. + t + -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f))) + -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f))) + -> t1 + -> t2 + -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)) + f _ rw1 rw2' n f = do + res1 <- rw1 n f + case res1 of + Nothing -> rw2' n f + Just gr -> return $ Just $ badd_rw rw2 gr + +iterBwdRw :: forall n f. BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f +iterBwdRw rw = wrapBR f rw + where f :: forall t e x t1 t2. 
+ t + -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f))) + -> t1 + -> t2 + -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)) + f _ rw' n f = liftM (liftM (badd_rw (iterBwdRw rw))) (rw' n f) + +-- | Function inspired by 'add' in the paper +badd_rw :: BwdRewrite UniqSM n f + -> (Graph n e x, BwdRewrite UniqSM n f) + -> (Graph n e x, BwdRewrite UniqSM n f) +badd_rw rw2 (g, rw1) = (g, rw1 `thenBwdRw` rw2) diff --git a/compiler/cmm/Hoopl/Dataflow.hs b/compiler/cmm/Hoopl/Dataflow.hs new file mode 100644 index 0000000000..9745eac9d8 --- /dev/null +++ b/compiler/cmm/Hoopl/Dataflow.hs @@ -0,0 +1,887 @@ +-- +-- Copyright (c) 2010, João Dias, Simon Marlow, Simon Peyton Jones, +-- and Norman Ramsey +-- +-- Modifications copyright (c) The University of Glasgow 2012 +-- +-- This module is a specialised and optimised version of +-- Compiler.Hoopl.Dataflow in the hoopl package. In particular it is +-- specialised to the UniqSM monad. +-- + +{-# LANGUAGE RankNTypes, ScopedTypeVariables, GADTs, EmptyDataDecls, PatternGuards, TypeFamilies, MultiParamTypeClasses #-} +#if __GLASGOW_HASKELL__ >= 703 +{-# OPTIONS_GHC -fprof-auto-top #-} +#endif +#if __GLASGOW_HASKELL__ >= 701 +{-# LANGUAGE Trustworthy #-} +#endif + +module Hoopl.Dataflow + ( DataflowLattice(..), OldFact(..), NewFact(..), Fact, mkFactBase + , ChangeFlag(..) 
+ , FwdPass(..), FwdTransfer, mkFTransfer, mkFTransfer3, getFTransfer3 + -- * Respecting Fuel + + -- $fuel + , FwdRewrite, mkFRewrite, mkFRewrite3, getFRewrite3, noFwdRewrite + , wrapFR, wrapFR2 + , BwdPass(..), BwdTransfer, mkBTransfer, mkBTransfer3, getBTransfer3 + , wrapBR, wrapBR2 + , BwdRewrite, mkBRewrite, mkBRewrite3, getBRewrite3, noBwdRewrite + , analyzeAndRewriteFwd, analyzeAndRewriteBwd + , analyzeFwd, analyzeFwdBlocks, analyzeBwd + ) +where + +import UniqSupply + +import Data.Maybe +import Data.Array + +import Compiler.Hoopl hiding + ( mkBRewrite3, mkFRewrite3, noFwdRewrite, noBwdRewrite + , analyzeAndRewriteBwd, analyzeAndRewriteFwd + ) +import Compiler.Hoopl.Internals + ( wrapFR, wrapFR2 + , wrapBR, wrapBR2 + , splice + ) + + +-- ----------------------------------------------------------------------------- + +noRewrite :: a -> b -> UniqSM (Maybe c) +noRewrite _ _ = return Nothing + +noFwdRewrite :: FwdRewrite UniqSM n f +noFwdRewrite = FwdRewrite3 (noRewrite, noRewrite, noRewrite) + +-- | Functions passed to 'mkFRewrite3' should not be aware of the fuel supply. +-- The result returned by 'mkFRewrite3' respects fuel. +mkFRewrite3 :: forall n f. + (n C O -> f -> UniqSM (Maybe (Graph n C O))) + -> (n O O -> f -> UniqSM (Maybe (Graph n O O))) + -> (n O C -> f -> UniqSM (Maybe (Graph n O C))) + -> FwdRewrite UniqSM n f +mkFRewrite3 f m l = FwdRewrite3 (lift f, lift m, lift l) + where lift :: forall t t1 a. (t -> t1 -> UniqSM (Maybe a)) + -> t -> t1 -> UniqSM (Maybe (a, FwdRewrite UniqSM n f)) + {-# INLINE lift #-} + lift rw node fact = do + a <- rw node fact + case a of + Nothing -> return Nothing + Just a -> return (Just (a,noFwdRewrite)) + +noBwdRewrite :: BwdRewrite UniqSM n f +noBwdRewrite = BwdRewrite3 (noRewrite, noRewrite, noRewrite) + +mkBRewrite3 :: forall n f. 
+ (n C O -> f -> UniqSM (Maybe (Graph n C O))) + -> (n O O -> f -> UniqSM (Maybe (Graph n O O))) + -> (n O C -> FactBase f -> UniqSM (Maybe (Graph n O C))) + -> BwdRewrite UniqSM n f +mkBRewrite3 f m l = BwdRewrite3 (lift f, lift m, lift l) + where lift :: forall t t1 a. (t -> t1 -> UniqSM (Maybe a)) + -> t -> t1 -> UniqSM (Maybe (a, BwdRewrite UniqSM n f)) + {-# INLINE lift #-} + lift rw node fact = do + a <- rw node fact + case a of + Nothing -> return Nothing + Just a -> return (Just (a,noBwdRewrite)) + +----------------------------------------------------------------------------- +-- Analyze and rewrite forward: the interface +----------------------------------------------------------------------------- + +-- | if the graph being analyzed is open at the entry, there must +-- be no other entry point, or all goes horribly wrong... +analyzeAndRewriteFwd + :: forall n f e x . NonLocal n => + FwdPass UniqSM n f + -> MaybeC e [Label] + -> Graph n e x -> Fact e f + -> UniqSM (Graph n e x, FactBase f, MaybeO x f) +analyzeAndRewriteFwd pass entries g f = + do (rg, fout) <- arfGraph pass (fmap targetLabels entries) g f + let (g', fb) = normalizeGraph rg + return (g', fb, distinguishedExitFact g' fout) + +distinguishedExitFact :: forall n e x f . Graph n e x -> Fact x f -> MaybeO x f +distinguishedExitFact g f = maybe g + where maybe :: Graph n e x -> MaybeO x f + maybe GNil = JustO f + maybe (GUnit {}) = JustO f + maybe (GMany _ _ x) = case x of NothingO -> NothingO + JustO _ -> JustO f + +---------------------------------------------------------------- +-- Forward Implementation +---------------------------------------------------------------- + +type Entries e = MaybeC e [Label] + +arfGraph :: forall n f e x . 
NonLocal n => + FwdPass UniqSM n f -> + Entries e -> Graph n e x -> Fact e f -> UniqSM (DG f n e x, Fact x f) +arfGraph pass@FwdPass { fp_lattice = lattice, + fp_transfer = transfer, + fp_rewrite = rewrite } entries g in_fact = graph g in_fact + where + {- nested type synonyms would be so lovely here + type ARF thing = forall e x . thing e x -> f -> m (DG f n e x, Fact x f) + type ARFX thing = forall e x . thing e x -> Fact e f -> m (DG f n e x, Fact x f) + -} + graph :: Graph n e x -> Fact e f -> UniqSM (DG f n e x, Fact x f) + block :: forall e x . + Block n e x -> f -> UniqSM (DG f n e x, Fact x f) + + body :: [Label] -> LabelMap (Block n C C) + -> Fact C f -> UniqSM (DG f n C C, Fact C f) + -- Outgoing factbase is restricted to Labels *not* in + -- in the Body; the facts for Labels *in* + -- the Body are in the 'DG f n C C' + + cat :: forall e a x f1 f2 f3. + (f1 -> UniqSM (DG f n e a, f2)) + -> (f2 -> UniqSM (DG f n a x, f3)) + -> (f1 -> UniqSM (DG f n e x, f3)) + + graph GNil f = return (dgnil, f) + graph (GUnit blk) f = block blk f + graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f + where + ebcat :: MaybeO e (Block n O C) -> Body n -> Fact e f -> UniqSM (DG f n e C, Fact C f) + exit :: MaybeO x (Block n C O) -> Fact C f -> UniqSM (DG f n C x, Fact x f) + exit (JustO blk) f = arfx block blk f + exit NothingO f = return (dgnilC, f) + ebcat entry bdy f = c entries entry f + where c :: MaybeC e [Label] -> MaybeO e (Block n O C) + -> Fact e f -> UniqSM (DG f n e C, Fact C f) + c NothingC (JustO entry) f = (block entry `cat` body (successors entry) bdy) f + c (JustC entries) NothingO f = body entries bdy f + c _ _ _ = error "bogus GADT pattern match failure" + + -- Lift from nodes to blocks + block BNil f = return (dgnil, f) + block (BlockCO n b) f = (node n `cat` block b) f + block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f + block (BlockOC b n) f = (block b `cat` node n) f + + block (BMiddle n) f = node n f + block (BCat b1 b2) f = 
(block b1 `cat` block b2) f + block (BHead h n) f = (block h `cat` node n) f + block (BTail n t) f = (node n `cat` block t) f + + {-# INLINE node #-} + node :: forall e x . (ShapeLifter e x) + => n e x -> f -> UniqSM (DG f n e x, Fact x f) + node n f + = do { grw <- frewrite rewrite n f + ; case grw of + Nothing -> return ( singletonDG f n + , ftransfer transfer n f ) + Just (g, rw) -> + let pass' = pass { fp_rewrite = rw } + f' = fwdEntryFact n f + in arfGraph pass' (fwdEntryLabel n) g f' } + + -- | Compose fact transformers and concatenate the resulting + -- rewritten graphs. + {-# INLINE cat #-} + cat ft1 ft2 f = do { (g1,f1) <- ft1 f + ; (g2,f2) <- ft2 f1 + ; let !g = g1 `dgSplice` g2 + ; return (g, f2) } + + arfx :: forall x . + (Block n C x -> f -> UniqSM (DG f n C x, Fact x f)) + -> (Block n C x -> Fact C f -> UniqSM (DG f n C x, Fact x f)) + arfx arf thing fb = + arf thing $ fromJust $ lookupFact (entryLabel thing) $ joinInFacts lattice fb + -- joinInFacts adds debugging information + + + -- Outgoing factbase is restricted to Labels *not* in + -- in the Body; the facts for Labels *in* + -- the Body are in the 'DG f n C C' + body entries blockmap init_fbase + = fixpoint Fwd lattice do_block entries blockmap init_fbase + where + lattice = fp_lattice pass + do_block :: forall x . Block n C x -> FactBase f + -> UniqSM (DG f n C x, Fact x f) + do_block b fb = block b entryFact + where entryFact = getFact lattice (entryLabel b) fb + + +-- Join all the incoming facts with bottom. +-- We know the results _shouldn't change_, but the transfer +-- functions might, for example, generate some debugging traces. 
+joinInFacts :: DataflowLattice f -> FactBase f -> FactBase f +joinInFacts (lattice @ DataflowLattice {fact_bot = bot, fact_join = fj}) fb = + mkFactBase lattice $ map botJoin $ mapToList fb + where botJoin (l, f) = (l, snd $ fj l (OldFact bot) (NewFact f)) + +forwardBlockList :: (NonLocal n) + => [Label] -> Body n -> [Block n C C] +-- This produces a list of blocks in order suitable for forward analysis, +-- along with the list of Labels it may depend on for facts. +forwardBlockList entries blks = postorder_dfs_from blks entries + +---------------------------------------------------------------- +-- Forward Analysis only +---------------------------------------------------------------- + +-- | if the graph being analyzed is open at the entry, there must +-- be no other entry point, or all goes horribly wrong... +analyzeFwd + :: forall n f e . NonLocal n => + FwdPass UniqSM n f + -> MaybeC e [Label] + -> Graph n e C -> Fact e f + -> FactBase f +analyzeFwd FwdPass { fp_lattice = lattice, + fp_transfer = FwdTransfer3 (ftr, mtr, ltr) } + entries g in_fact = graph g in_fact + where + graph :: Graph n e C -> Fact e f -> FactBase f + graph (GMany entry blockmap NothingO) + = case (entries, entry) of + (NothingC, JustO entry) -> block entry `cat` body (successors entry) + (JustC entries, NothingO) -> body entries + _ -> error "bogus GADT pattern match failure" + where + body :: [Label] -> Fact C f -> Fact C f + body entries f + = fixpointAnal Fwd lattice do_block entries blockmap f + where + do_block :: forall x . Block n C x -> FactBase f -> Fact x f + do_block b fb = block b entryFact + where entryFact = getFact lattice (entryLabel b) fb + + -- NB. eta-expand block, GHC can't do this by itself. See #5809. + block :: forall e x . 
Block n e x -> f -> Fact x f + block BNil f = f + block (BlockCO n b) f = (ftr n `cat` block b) f + block (BlockCC l b n) f = (ftr l `cat` (block b `cat` ltr n)) f + block (BlockOC b n) f = (block b `cat` ltr n) f + + block (BMiddle n) f = mtr n f + block (BCat b1 b2) f = (block b1 `cat` block b2) f + block (BHead h n) f = (block h `cat` mtr n) f + block (BTail n t) f = (mtr n `cat` block t) f + + {-# INLINE cat #-} + cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3) + cat ft1 ft2 = \f -> ft2 $! ft1 f + +-- | if the graph being analyzed is open at the entry, there must +-- be no other entry point, or all goes horribly wrong... +analyzeFwdBlocks + :: forall n f e . NonLocal n => + FwdPass UniqSM n f + -> MaybeC e [Label] + -> Graph n e C -> Fact e f + -> FactBase f +analyzeFwdBlocks FwdPass { fp_lattice = lattice, + fp_transfer = FwdTransfer3 (ftr, _, ltr) } + entries g in_fact = graph g in_fact + where + graph :: Graph n e C -> Fact e f -> FactBase f + graph (GMany entry blockmap NothingO) + = case (entries, entry) of + (NothingC, JustO entry) -> block entry `cat` body (successors entry) + (JustC entries, NothingO) -> body entries + _ -> error "bogus GADT pattern match failure" + where + body :: [Label] -> Fact C f -> Fact C f + body entries f + = fixpointAnal Fwd lattice do_block entries blockmap f + where + do_block :: forall x . Block n C x -> FactBase f -> Fact x f + do_block b fb = block b entryFact + where entryFact = getFact lattice (entryLabel b) fb + + -- NB. eta-expand block, GHC can't do this by itself. See #5809. + block :: forall e x . Block n e x -> f -> Fact x f + block BNil f = f + block (BlockCO n _) f = ftr n f + block (BlockCC l _ n) f = (ftr l `cat` ltr n) f + block (BlockOC _ n) f = ltr n f + block _ _ = error "analyzeFwdBlocks" + + {-# INLINE cat #-} + cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3) + cat ft1 ft2 = \f -> ft2 $! 
ft1 f + +---------------------------------------------------------------- +-- Backward Analysis only +---------------------------------------------------------------- + +-- | if the graph being analyzed is open at the entry, there must +-- be no other entry point, or all goes horribly wrong... +analyzeBwd + :: forall n f e . NonLocal n => + BwdPass UniqSM n f + -> MaybeC e [Label] + -> Graph n e C -> Fact C f + -> FactBase f +analyzeBwd BwdPass { bp_lattice = lattice, + bp_transfer = BwdTransfer3 (ftr, mtr, ltr) } + entries g in_fact = graph g in_fact + where + graph :: Graph n e C -> Fact C f -> FactBase f + graph (GMany entry blockmap NothingO) + = case (entries, entry) of + (NothingC, JustO entry) -> body (successors entry) + (JustC entries, NothingO) -> body entries + _ -> error "bogus GADT pattern match failure" + where + body :: [Label] -> Fact C f -> Fact C f + body entries f + = fixpointAnal Bwd lattice do_block entries blockmap f + where + do_block :: forall x . Block n C x -> Fact x f -> FactBase f + do_block b fb = mapSingleton (entryLabel b) (block b fb) + + -- NB. eta-expand block, GHC can't do this by itself. See #5809. + block :: forall e x . Block n e x -> Fact x f -> f + block BNil f = f + block (BlockCO n b) f = (ftr n `cat` block b) f + block (BlockCC l b n) f = ((ftr l `cat` block b) `cat` ltr n) f + block (BlockOC b n) f = (block b `cat` ltr n) f + + block (BMiddle n) f = mtr n f + block (BCat b1 b2) f = (block b1 `cat` block b2) f + block (BHead h n) f = (block h `cat` mtr n) f + block (BTail n t) f = (mtr n `cat` block t) f + + {-# INLINE cat #-} + cat :: forall f1 f2 f3 . (f2 -> f3) -> (f1 -> f2) -> (f1 -> f3) + cat ft1 ft2 = \f -> ft1 $! 
ft2 f + +----------------------------------------------------------------------------- +-- Backward analysis and rewriting: the interface +----------------------------------------------------------------------------- + + +-- | if the graph being analyzed is open at the exit, I don't +-- quite understand the implications of possible other exits +analyzeAndRewriteBwd + :: NonLocal n + => BwdPass UniqSM n f + -> MaybeC e [Label] -> Graph n e x -> Fact x f + -> UniqSM (Graph n e x, FactBase f, MaybeO e f) +analyzeAndRewriteBwd pass entries g f = + do (rg, fout) <- arbGraph pass (fmap targetLabels entries) g f + let (g', fb) = normalizeGraph rg + return (g', fb, distinguishedEntryFact g' fout) + +distinguishedEntryFact :: forall n e x f . Graph n e x -> Fact e f -> MaybeO e f +distinguishedEntryFact g f = maybe g + where maybe :: Graph n e x -> MaybeO e f + maybe GNil = JustO f + maybe (GUnit {}) = JustO f + maybe (GMany e _ _) = case e of NothingO -> NothingO + JustO _ -> JustO f + + +----------------------------------------------------------------------------- +-- Backward implementation +----------------------------------------------------------------------------- + +arbGraph :: forall n f e x . + NonLocal n => + BwdPass UniqSM n f -> + Entries e -> Graph n e x -> Fact x f -> UniqSM (DG f n e x, Fact e f) +arbGraph pass@BwdPass { bp_lattice = lattice, + bp_transfer = transfer, + bp_rewrite = rewrite } entries g in_fact = graph g in_fact + where + {- nested type synonyms would be so lovely here + type ARB thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, f) + type ARBX thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, Fact e f) + -} + graph :: Graph n e x -> Fact x f -> UniqSM (DG f n e x, Fact e f) + block :: forall e x . Block n e x -> Fact x f -> UniqSM (DG f n e x, f) + body :: [Label] -> Body n -> Fact C f -> UniqSM (DG f n C C, Fact C f) + node :: forall e x . 
(ShapeLifter e x) + => n e x -> Fact x f -> UniqSM (DG f n e x, f) + cat :: forall e a x info info' info''. + (info' -> UniqSM (DG f n e a, info'')) + -> (info -> UniqSM (DG f n a x, info')) + -> (info -> UniqSM (DG f n e x, info'')) + + graph GNil f = return (dgnil, f) + graph (GUnit blk) f = block blk f + graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f + where + ebcat :: MaybeO e (Block n O C) -> Body n -> Fact C f -> UniqSM (DG f n e C, Fact e f) + exit :: MaybeO x (Block n C O) -> Fact x f -> UniqSM (DG f n C x, Fact C f) + exit (JustO blk) f = arbx block blk f + exit NothingO f = return (dgnilC, f) + ebcat entry bdy f = c entries entry f + where c :: MaybeC e [Label] -> MaybeO e (Block n O C) + -> Fact C f -> UniqSM (DG f n e C, Fact e f) + c NothingC (JustO entry) f = (block entry `cat` body (successors entry) bdy) f + c (JustC entries) NothingO f = body entries bdy f + c _ _ _ = error "bogus GADT pattern match failure" + + -- Lift from nodes to blocks + block BNil f = return (dgnil, f) + block (BlockCO n b) f = (node n `cat` block b) f + block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f + block (BlockOC b n) f = (block b `cat` node n) f + + block (BMiddle n) f = node n f + block (BCat b1 b2) f = (block b1 `cat` block b2) f + block (BHead h n) f = (block h `cat` node n) f + block (BTail n t) f = (node n `cat` block t) f + + {-# INLINE node #-} + node n f + = do { bwdres <- brewrite rewrite n f + ; case bwdres of + Nothing -> return (singletonDG entry_f n, entry_f) + where entry_f = btransfer transfer n f + Just (g, rw) -> + do { let pass' = pass { bp_rewrite = rw } + ; (g, f) <- arbGraph pass' (fwdEntryLabel n) g f + ; return (g, bwdEntryFact lattice n f)} } + + -- | Compose fact transformers and concatenate the resulting + -- rewritten graphs. + {-# INLINE cat #-} + cat ft1 ft2 f = do { (g2,f2) <- ft2 f + ; (g1,f1) <- ft1 f2 + ; let !g = g1 `dgSplice` g2 + ; return (g, f1) } + + arbx :: forall x . 
+ (Block n C x -> Fact x f -> UniqSM (DG f n C x, f)) + -> (Block n C x -> Fact x f -> UniqSM (DG f n C x, Fact C f)) + + arbx arb thing f = do { (rg, f) <- arb thing f + ; let fb = joinInFacts (bp_lattice pass) $ + mapSingleton (entryLabel thing) f + ; return (rg, fb) } + -- joinInFacts adds debugging information + + -- Outgoing factbase is restricted to Labels *not* in + -- in the Body; the facts for Labels *in* + -- the Body are in the 'DG f n C C' + body entries blockmap init_fbase + = fixpoint Bwd (bp_lattice pass) do_block entries blockmap init_fbase + where + do_block :: forall x. Block n C x -> Fact x f -> UniqSM (DG f n C x, LabelMap f) + do_block b f = do (g, f) <- block b f + return (g, mapSingleton (entryLabel b) f) + + +{- + +The forward and backward cases are not dual. In the forward case, the +entry points are known, and one simply traverses the body blocks from +those points. In the backward case, something is known about the exit +points, but this information is essentially useless, because we don't +actually have a dual graph (that is, one with edges reversed) to +compute with. (Even if we did have a dual graph, it would not avail +us---a backward analysis must include reachable blocks that don't +reach the exit, as in a procedure that loops forever and has side +effects.) + +-} + +----------------------------------------------------------------------------- +-- fixpoint +----------------------------------------------------------------------------- + +data Direction = Fwd | Bwd + +-- | fixpointing for analysis-only +-- +fixpointAnal :: forall n f. 
NonLocal n + => Direction + -> DataflowLattice f + -> (Block n C C -> Fact C f -> Fact C f) + -> [Label] + -> LabelMap (Block n C C) + -> Fact C f -> FactBase f + +fixpointAnal direction DataflowLattice{ fact_bot = _, fact_join = join } + do_block entries blockmap init_fbase + = loop start init_fbase + where + blocks = sortBlocks direction entries blockmap + n = length blocks + block_arr = {-# SCC "block_arr" #-} listArray (0,n-1) blocks + start = {-# SCC "start" #-} [0..n-1] + dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks + + loop + :: IntHeap -- blocks still to analyse + -> FactBase f -- current factbase (increases monotonically) + -> FactBase f + + loop [] fbase = fbase + loop (ix:todo) fbase = + let + blk = block_arr ! ix + + out_facts = {-# SCC "do_block" #-} do_block blk fbase + + !(todo', fbase') = {-# SCC "mapFoldWithKey" #-} + mapFoldWithKey (updateFact join dep_blocks) + (todo,fbase) out_facts + in + -- trace ("analysing: " ++ show (entryLabel blk)) $ + -- trace ("fbase': " ++ show (mapKeys fbase')) $ return () + -- trace ("changed: " ++ show changed) $ return () + -- trace ("to analyse: " ++ show to_analyse) $ return () + + loop todo' fbase' + + +-- | fixpointing for combined analysis/rewriting +-- +fixpoint :: forall n f. 
NonLocal n + => Direction + -> DataflowLattice f + -> (Block n C C -> Fact C f -> UniqSM (DG f n C C, Fact C f)) + -> [Label] + -> LabelMap (Block n C C) + -> (Fact C f -> UniqSM (DG f n C C, Fact C f)) + +fixpoint direction DataflowLattice{ fact_bot = _, fact_join = join } + do_block entries blockmap init_fbase + = do + -- trace ("fixpoint: " ++ show (case direction of Fwd -> True; Bwd -> False) ++ " " ++ show (mapKeys blockmap) ++ show entries ++ " " ++ show (mapKeys init_fbase)) $ return() + (fbase, newblocks) <- loop start init_fbase mapEmpty + -- trace ("fixpoint DONE: " ++ show (mapKeys fbase) ++ show (mapKeys newblocks)) $ return() + return (GMany NothingO newblocks NothingO, + mapDeleteList (mapKeys blockmap) fbase) + -- The successors of the Graph are the the Labels + -- for which we have facts and which are *not* in + -- the blocks of the graph + where + blocks = sortBlocks direction entries blockmap + n = length blocks + block_arr = {-# SCC "block_arr" #-} listArray (0,n-1) blocks + start = {-# SCC "start" #-} [0..n-1] + dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks + + loop + :: IntHeap + -> FactBase f -- current factbase (increases monotonically) + -> LabelMap (DBlock f n C C) -- transformed graph + -> UniqSM (FactBase f, LabelMap (DBlock f n C C)) + + loop [] fbase newblocks = return (fbase, newblocks) + loop (ix:todo) fbase !newblocks = do + let blk = block_arr ! 
ix + + -- trace ("analysing: " ++ show (entryLabel blk)) $ return () + (rg, out_facts) <- do_block blk fbase + let !(todo', fbase') = + mapFoldWithKey (updateFact join dep_blocks) + (todo,fbase) out_facts + + -- trace ("fbase': " ++ show (mapKeys fbase')) $ return () + -- trace ("changed: " ++ show changed) $ return () + -- trace ("to analyse: " ++ show to_analyse) $ return () + + let newblocks' = case rg of + GMany _ blks _ -> mapUnion blks newblocks + + loop todo' fbase' newblocks' + + +{- Note [TxFactBase invariants] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The TxFactBase is used only during a fixpoint iteration (or "sweep"), +and accumulates facts (and the transformed code) during the fixpoint +iteration. + +* tfb_fbase increases monotonically, across all sweeps + +* At the beginning of each sweep + tfb_cha = NoChange + tfb_lbls = {} + +* During each sweep we process each block in turn. Processing a block + is done thus: + 1. Read from tfb_fbase the facts for its entry label (forward) + or successors labels (backward) + 2. Transform those facts into new facts for its successors (forward) + or entry label (backward) + 3. Augment tfb_fbase with that info + We call the labels read in step (1) the "in-labels" of the sweep + +* The field tfb_lbls is the set of in-labels of all blocks that have + been processed so far this sweep, including the block that is + currently being processed. tfb_lbls is initialised to {}. It is a + subset of the Labels of the *original* (not transformed) blocks. + +* The tfb_cha field is set to SomeChange iff we decide we need to + perform another iteration of the fixpoint loop. It is initialsed to NoChange. 
+ + Specifically, we set tfb_cha to SomeChange in step (3) iff + (a) The fact in tfb_fbase for a block L changes + (b) L is in tfb_lbls + Reason: until a label enters the in-labels its accumuated fact in tfb_fbase + has not been read, hence cannot affect the outcome + +Note [Unreachable blocks] +~~~~~~~~~~~~~~~~~~~~~~~~~ +A block that is not in the domain of tfb_fbase is "currently unreachable". +A currently-unreachable block is not even analyzed. Reason: consider +constant prop and this graph, with entry point L1: + L1: x:=3; goto L4 + L2: x:=4; goto L4 + L4: if x>3 goto L2 else goto L5 +Here L2 is actually unreachable, but if we process it with bottom input fact, +we'll propagate (x=4) to L4, and nuke the otherwise-good rewriting of L4. + +* If a currently-unreachable block is not analyzed, then its rewritten + graph will not be accumulated in tfb_rg. And that is good: + unreachable blocks simply do not appear in the output. + +* Note that clients must be careful to provide a fact (even if bottom) + for each entry point. Otherwise useful blocks may be garbage collected. + +* Note that updateFact must set the change-flag if a label goes from + not-in-fbase to in-fbase, even if its fact is bottom. In effect the + real fact lattice is + UNR + bottom + the points above bottom + +* Even if the fact is going from UNR to bottom, we still call the + client's fact_join function because it might give the client + some useful debugging information. + +* All of this only applies for *forward* ixpoints. For the backward + case we must treat every block as reachable; it might finish with a + 'return', and therefore have no successors, for example. +-} + + +----------------------------------------------------------------------------- +-- Pieces that are shared by fixpoint and fixpoint_anal +----------------------------------------------------------------------------- + +-- | Sort the blocks into the right order for analysis. 
+sortBlocks :: NonLocal n => Direction -> [Label] -> LabelMap (Block n C C) + -> [Block n C C] +sortBlocks direction entries blockmap + = case direction of Fwd -> fwd + Bwd -> reverse fwd + where fwd = forwardBlockList entries blockmap + +-- | construct a mapping from L -> block indices. If the fact for L +-- changes, re-analyse the given blocks. +mkDepBlocks :: NonLocal n => Direction -> [Block n C C] -> LabelMap [Int] +mkDepBlocks Fwd blocks = go blocks 0 mapEmpty + where go [] !_ m = m + go (b:bs) !n m = go bs (n+1) $! mapInsert (entryLabel b) [n] m +mkDepBlocks Bwd blocks = go blocks 0 mapEmpty + where go [] !_ m = m + go (b:bs) !n m = go bs (n+1) $! go' (successors b) m + where go' [] m = m + go' (l:ls) m = go' ls (mapInsertWith (++) l [n] m) + + +-- | After some new facts have been generated by analysing a block, we +-- fold this function over them to generate (a) a list of block +-- indices to (re-)analyse, and (b) the new FactBase. +-- +updateFact :: JoinFun f -> LabelMap [Int] + -> Label -> f -- out fact + -> (IntHeap, FactBase f) + -> (IntHeap, FactBase f) + +updateFact fact_join dep_blocks lbl new_fact (todo, fbase) + = case lookupFact lbl fbase of + Nothing -> let !z = mapInsert lbl new_fact fbase in (changed, z) + -- Note [no old fact] + Just old_fact -> + case fact_join lbl (OldFact old_fact) (NewFact new_fact) of + (NoChange, _) -> (todo, fbase) + (_, f) -> let !z = mapInsert lbl f fbase in (changed, z) + where + changed = foldr insertIntHeap todo $ + mapFindWithDefault [] lbl dep_blocks + +{- +Note [no old fact] + +We know that the new_fact is >= _|_, so we don't need to join. However, +if the new fact is also _|_, and we have already analysed its block, +we don't need to record a change. So there's a tradeoff here. It turns +out that always recording a change is faster. 
+-} + +----------------------------------------------------------------------------- +-- DG: an internal data type for 'decorated graphs' +-- TOTALLY internal to Hoopl; each block is decorated with a fact +----------------------------------------------------------------------------- + +type DG f = Graph' (DBlock f) +data DBlock f n e x = DBlock f (Block n e x) -- ^ block decorated with fact + +instance NonLocal n => NonLocal (DBlock f n) where + entryLabel (DBlock _ b) = entryLabel b + successors (DBlock _ b) = successors b + +--- constructors + +dgnil :: DG f n O O +dgnilC :: DG f n C C +dgSplice :: NonLocal n => DG f n e a -> DG f n a x -> DG f n e x + +---- observers + +normalizeGraph :: forall n f e x . + NonLocal n => DG f n e x + -> (Graph n e x, FactBase f) + -- A Graph together with the facts for that graph + -- The domains of the two maps should be identical + +normalizeGraph g = (mapGraphBlocks dropFact g, facts g) + where dropFact :: DBlock t t1 t2 t3 -> Block t1 t2 t3 + dropFact (DBlock _ b) = b + facts :: DG f n e x -> FactBase f + facts GNil = noFacts + facts (GUnit _) = noFacts + facts (GMany _ body exit) = bodyFacts body `mapUnion` exitFacts exit + exitFacts :: MaybeO x (DBlock f n C O) -> FactBase f + exitFacts NothingO = noFacts + exitFacts (JustO (DBlock f b)) = mapSingleton (entryLabel b) f + bodyFacts :: LabelMap (DBlock f n C C) -> FactBase f + bodyFacts body = mapFoldWithKey f noFacts body + where f :: forall t a x. (NonLocal t) => Label -> DBlock a t C x -> LabelMap a -> LabelMap a + f lbl (DBlock f _) fb = mapInsert lbl f fb + +--- implementation of the constructors (boring) + +dgnil = GNil +dgnilC = GMany NothingO emptyBody NothingO + +dgSplice = splice fzCat + where fzCat :: DBlock f n e O -> DBlock t n O x -> DBlock f n e x + fzCat (DBlock f b1) (DBlock _ b2) = DBlock f $! b1 `blockAppend` b2 + -- NB. strictness, this function is hammered. 
+ +---------------------------------------------------------------- +-- Utilities +---------------------------------------------------------------- + +-- Lifting based on shape: +-- - from nodes to blocks +-- - from facts to fact-like things +-- Lowering back: +-- - from fact-like things to facts +-- Note that the latter two functions depend only on the entry shape. +class ShapeLifter e x where + singletonDG :: f -> n e x -> DG f n e x + fwdEntryFact :: NonLocal n => n e x -> f -> Fact e f + fwdEntryLabel :: NonLocal n => n e x -> MaybeC e [Label] + ftransfer :: FwdTransfer n f -> n e x -> f -> Fact x f + frewrite :: FwdRewrite m n f -> n e x + -> f -> m (Maybe (Graph n e x, FwdRewrite m n f)) +-- @ end node.tex + bwdEntryFact :: NonLocal n => DataflowLattice f -> n e x -> Fact e f -> f + btransfer :: BwdTransfer n f -> n e x -> Fact x f -> f + brewrite :: BwdRewrite m n f -> n e x + -> Fact x f -> m (Maybe (Graph n e x, BwdRewrite m n f)) + +instance ShapeLifter C O where + singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil)) + fwdEntryFact n f = mapSingleton (entryLabel n) f + bwdEntryFact lat n fb = getFact lat (entryLabel n) fb + ftransfer (FwdTransfer3 (ft, _, _)) n f = ft n f + btransfer (BwdTransfer3 (bt, _, _)) n f = bt n f + frewrite (FwdRewrite3 (fr, _, _)) n f = fr n f + brewrite (BwdRewrite3 (br, _, _)) n f = br n f + fwdEntryLabel n = JustC [entryLabel n] + +instance ShapeLifter O O where + singletonDG f = gUnitOO . DBlock f . 
BMiddle + fwdEntryFact _ f = f + bwdEntryFact _ _ f = f + ftransfer (FwdTransfer3 (_, ft, _)) n f = ft n f + btransfer (BwdTransfer3 (_, bt, _)) n f = bt n f + frewrite (FwdRewrite3 (_, fr, _)) n f = fr n f + brewrite (BwdRewrite3 (_, br, _)) n f = br n f + fwdEntryLabel _ = NothingC + +instance ShapeLifter O C where + singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n)) + fwdEntryFact _ f = f + bwdEntryFact _ _ f = f + ftransfer (FwdTransfer3 (_, _, ft)) n f = ft n f + btransfer (BwdTransfer3 (_, _, bt)) n f = bt n f + frewrite (FwdRewrite3 (_, _, fr)) n f = fr n f + brewrite (BwdRewrite3 (_, _, br)) n f = br n f + fwdEntryLabel _ = NothingC + +{- +class ShapeLifter e x where + singletonDG :: f -> n e x -> DG f n e x + +instance ShapeLifter C O where + singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil)) + +instance ShapeLifter O O where + singletonDG f = gUnitOO . DBlock f . BMiddle + +instance ShapeLifter O C where + singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n)) +-} + +-- Fact lookup: the fact `orelse` bottom +getFact :: DataflowLattice f -> Label -> FactBase f -> f +getFact lat l fb = case lookupFact l fb of Just f -> f + Nothing -> fact_bot lat + + + +{- Note [Respects fuel] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-} +-- $fuel +-- A value of type 'FwdRewrite' or 'BwdRewrite' /respects fuel/ if +-- any function contained within the value satisfies the following properties: +-- +-- * When fuel is exhausted, it always returns 'Nothing'. +-- +-- * When it returns @Just g rw@, it consumes /exactly/ one unit +-- of fuel, and new rewrite 'rw' also respects fuel. +-- +-- Provided that functions passed to 'mkFRewrite', 'mkFRewrite3', +-- 'mkBRewrite', and 'mkBRewrite3' are not aware of the fuel supply, +-- the results respect fuel. +-- +-- It is an /unchecked/ run-time error for the argument passed to 'wrapFR', +-- 'wrapFR2', 'wrapBR', or 'warpBR2' to return a function that does not respect fuel. 
+ +-- ----------------------------------------------------------------------------- +-- a Heap of Int + +-- We should really use a proper Heap here, but my attempts to make +-- one have not succeeded in beating the simple ordered list. Another +-- alternative is IntSet (using deleteFindMin), but that was also +-- slower than the ordered list in my experiments --SDM 25/1/2012 + +type IntHeap = [Int] -- ordered + +insertIntHeap :: Int -> [Int] -> [Int] +insertIntHeap x [] = [x] +insertIntHeap x (y:ys) + | x < y = x : y : ys + | x == y = x : ys + | otherwise = y : insertIntHeap x ys diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs index 04586b1029..ecd4d4f985 100644 --- a/compiler/cmm/MkGraph.hs +++ b/compiler/cmm/MkGraph.hs @@ -1,29 +1,19 @@ {-# LANGUAGE GADTs #-} --- ToDo: remove -fno-warn-warnings-deprecations -{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-} --- ToDo: remove -fno-warn-incomplete-patterns -{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-} - --- Module for building CmmAGraphs. - --- As the CmmAGraph is a wrapper over Graph CmmNode O x, it is different --- from Hoopl's AGraph. The current clients expect functions with the --- same names Hoopl uses, so this module cannot be in the same namespace --- as Compiler.Hoopl. - module MkGraph - ( CmmAGraph - , emptyAGraph, (<*>), catAGraphs, outOfLine - , mkLabel, mkMiddle, mkLast - , withFreshLabel, withUnique, lgraphOfAGraph, labelAGraph + ( CmmAGraph, CgStmt(..) 
+ , (<*>), catAGraphs + , mkLabel, mkMiddle, mkLast, outOfLine + , lgraphOfAGraph, labelAGraph , stackStubExpr - , mkNop, mkAssign, mkStore, mkCall, mkCmmCall, mkSafeCall, mkUnsafeCall, mkFinalCall - , mkJump, mkDirectJump, mkForeignJump, mkJumpGC, mkCbranch, mkSwitch - , mkReturn, mkReturnSimple, mkComment, mkCallEntry - , mkBranch, mkCmmIfThenElse, mkCmmIfThen, mkCmmWhileDo - , copyInOflow, copyInSlot, copyOutOflow, copyOutSlot + , mkNop, mkAssign, mkStore, mkUnsafeCall, mkFinalCall, mkCallReturnsTo + , mkJump, mkDirectJump, mkForeignJump, mkForeignJumpExtra, mkJumpGC + , mkCbranch, mkSwitch + , mkReturn, mkReturnSimple, mkComment, mkCallEntry, mkBranch + , copyInOflow, copyOutOflow + , noExtraStack + , toCall, Transfer(..) ) where @@ -31,250 +21,232 @@ import BlockId import Cmm import CmmCallConv (assignArgumentsPos, ParamLocation(..)) + import Compiler.Hoopl hiding (Unique, (<*>), mkFirst, mkMiddle, mkLast, mkLabel, mkBranch, Shape(..)) -import qualified Compiler.Hoopl as H -import Compiler.Hoopl.GHC (uniqueToLbl) import FastString import ForeignCall import Outputable import Prelude hiding (succ) import SMRep (ByteOff) -import StaticFlags -import Unique import UniqSupply -import Util +import OrdList #include "HsVersions.h" -{- -A 'CmmAGraph' is an abstract version of a 'Graph CmmNode O x' from module -'Cmm'. The difference is that the 'CmmAGraph' can be eigher open of closed at -exit and it can supply fresh Labels and Uniques. - -It also supports a splicing operation <*>, which is different from the Hoopl's -<*>, because it splices two CmmAGraphs. Specifically, it can splice Graph -O C and Graph O x. In this case, the open beginning of the second graph is -thrown away. In the debug mode this sequence is checked to be empty or -containing a branch (see note [Branch follows branch]). - -When an CmmAGraph open at exit is being converted to a CmmGraph, the output -exit sequence is considered unreachable. 
If the graph consist of one block -only, if it not the case and we crash. Otherwise we just throw the exit -sequence away (and in debug mode we test that it really was unreachable). --} - -{- -Node [Branch follows branch] -============================ -Why do we say it's ok for a Branch to follow a Branch? -Because the standard constructor mkLabel has fall-through -semantics. So if you do a mkLabel, you finish the current block, -giving it a label, and start a new one that branches to that label. -Emitting a Branch at this point is fine: - goto L1; L2: ...stuff... --} - -data CmmGraphOC = Opened (Graph CmmNode O O) - | Closed (Graph CmmNode O C) -type CmmAGraph = UniqSM CmmGraphOC -- Graph open at entry - -{- -MS: I began with - newtype CmmAGraph = forall x. AG (UniqSM (Graph CmmNode O x)) -but that does not work well, because we cannot take the graph -out of the monad -- we do not know the type of what we would take -out and pattern matching does not help, as we cannot pattern match -on a graph inside the monad. --} -data Transfer = Call | Jump | Ret deriving Eq +----------------------------------------------------------------------------- +-- Building Graphs + + +-- | CmmAGraph is a chunk of code consisting of: +-- +-- * ordinary statements (assignments, stores etc.) +-- * jumps +-- * labels +-- * out-of-line labelled blocks +-- +-- The semantics is that control falls through labels and out-of-line +-- blocks. Everything after a jump up to the next label is by +-- definition unreachable code, and will be discarded. +-- +-- Two CmmAGraphs can be stuck together with <*>, with the meaning that +-- control flows from the first to the second. +-- +-- A 'CmmAGraph' can be turned into a 'CmmGraph' (closed at both ends) +-- by providing a label for the entry point; see 'labelAGraph'. 
+-- +type CmmAGraph = OrdList CgStmt + +data CgStmt + = CgLabel BlockId + | CgStmt (CmmNode O O) + | CgLast (CmmNode O C) + | CgFork BlockId CmmAGraph + +flattenCmmAGraph :: BlockId -> CmmAGraph -> CmmGraph +flattenCmmAGraph id stmts = + CmmGraph { g_entry = id, + g_graph = GMany NothingO body NothingO } + where + (block, blocks) = flatten (fromOL stmts) + entry = blockJoinHead (CmmEntry id) block + body = foldr addBlock emptyBody (entry:blocks) + + flatten :: [CgStmt] -> (Block CmmNode O C, [Block CmmNode C C]) + flatten [] = panic "flatten []" + + -- A label at the end of a function or fork: this label must not be reachable, + -- but it might be referred to from another BB that also isn't reachable. + -- Eliminating these has to be done with a dead-code analysis. For now, + -- we just make it into a well-formed block by adding a recursive jump. + flatten [CgLabel id] + = (goto_id, [blockJoinHead (CmmEntry id) goto_id] ) + where goto_id = blockJoinTail emptyBlock (CmmBranch id) + + -- A jump/branch: throw away all the code up to the next label, because + -- it is unreachable. Be careful to keep forks that we find on the way. 
+ flatten (CgLast stmt : stmts) + = case dropWhile isOrdinaryStmt stmts of + [] -> + ( sing, [] ) + [CgLabel id] -> + ( sing, [blockJoin (CmmEntry id) emptyBlock (CmmBranch id)] ) + (CgLabel id : stmts) -> + ( sing, blockJoinHead (CmmEntry id) block : blocks ) + where (block,blocks) = flatten stmts + (CgFork fork_id stmts : ss) -> + flatten (CgFork fork_id stmts : CgLast stmt : ss) + _ -> panic "MkGraph.flatten" + where + sing = blockJoinTail emptyBlock stmt + + flatten (s:ss) = + case s of + CgStmt stmt -> (blockCons stmt block, blocks) + CgLabel id -> (blockJoinTail emptyBlock (CmmBranch id), + blockJoinHead (CmmEntry id) block : blocks) + CgFork fork_id stmts -> + (block, blockJoinHead (CmmEntry fork_id) fork_block : fork_blocks ++ blocks) + where (fork_block, fork_blocks) = flatten (fromOL stmts) + _ -> panic "MkGraph.flatten" + where (block,blocks) = flatten ss + +isOrdinaryStmt :: CgStmt -> Bool +isOrdinaryStmt (CgStmt _) = True +isOrdinaryStmt (CgLast _) = True +isOrdinaryStmt _ = False + + ---------- AGraph manipulation -emptyAGraph :: CmmAGraph (<*>) :: CmmAGraph -> CmmAGraph -> CmmAGraph +(<*>) = appOL + catAGraphs :: [CmmAGraph] -> CmmAGraph +catAGraphs = concatOL + +-- | created a sequence "goto id; id:" as an AGraph +mkLabel :: BlockId -> CmmAGraph +mkLabel bid = unitOL (CgLabel bid) + +-- | creates an open AGraph from a given node +mkMiddle :: CmmNode O O -> CmmAGraph +mkMiddle middle = unitOL (CgStmt middle) -mkLabel :: BlockId -> CmmAGraph -- created a sequence "goto id; id:" as an AGraph -mkMiddle :: CmmNode O O -> CmmAGraph -- creates an open AGraph from a given node -mkLast :: CmmNode O C -> CmmAGraph -- created a closed AGraph from a given node +-- | created a closed AGraph from a given node +mkLast :: CmmNode O C -> CmmAGraph +mkLast last = unitOL (CgLast last) -withFreshLabel :: String -> (BlockId -> CmmAGraph) -> CmmAGraph -withUnique :: (Unique -> CmmAGraph) -> CmmAGraph +-- | A labelled code block; should end in a last node +outOfLine :: 
BlockId -> CmmAGraph -> CmmAGraph +outOfLine l g = unitOL (CgFork l g) +-- | allocate a fresh label for the entry point lgraphOfAGraph :: CmmAGraph -> UniqSM CmmGraph - -- ^ allocate a fresh label for the entry point +lgraphOfAGraph g = do u <- getUniqueM + return (flattenCmmAGraph (mkBlockId u) g) + +-- | use the given BlockId as the label of the entry point labelAGraph :: BlockId -> CmmAGraph -> UniqSM CmmGraph - -- ^ use the given BlockId as the label of the entry point +labelAGraph lbl ag = return (flattenCmmAGraph lbl ag) ---------- No-ops mkNop :: CmmAGraph +mkNop = nilOL + mkComment :: FastString -> CmmAGraph +#ifdef DEBUG +-- SDM: generating all those comments takes time, this saved about 4% for me +mkComment fs = mkMiddle $ CmmComment fs +#else +mkComment _ = nilOL +#endif ---------- Assignment and store mkAssign :: CmmReg -> CmmExpr -> CmmAGraph -mkStore :: CmmExpr -> CmmExpr -> CmmAGraph +mkAssign l r = mkMiddle $ CmmAssign l r ----------- Calls -mkCall :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmActual] -> - UpdFrameOffset -> CmmAGraph -mkCmmCall :: CmmExpr -> [CmmFormal] -> [CmmActual] -> - UpdFrameOffset -> CmmAGraph - -- Native C-- calling convention -mkSafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset -> Bool -> CmmAGraph -mkUnsafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph -mkFinalCall :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset -> CmmAGraph - -- Never returns; like exit() or barf() +mkStore :: CmmExpr -> CmmExpr -> CmmAGraph +mkStore l r = mkMiddle $ CmmStore l r ---------- Control transfer -mkJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph -mkDirectJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph -mkJumpGC :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph -mkForeignJump :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph -mkCbranch :: CmmExpr -> BlockId -> BlockId -> CmmAGraph -mkSwitch :: CmmExpr -> [Maybe BlockId] -> 
CmmAGraph +mkJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph +mkJump e actuals updfr_off = + lastWithArgs Jump Old NativeNodeCall actuals updfr_off $ + toCall e Nothing updfr_off 0 + +mkDirectJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph +mkDirectJump e actuals updfr_off = + lastWithArgs Jump Old NativeDirectCall actuals updfr_off $ + toCall e Nothing updfr_off 0 + +mkJumpGC :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph +mkJumpGC e actuals updfr_off = + lastWithArgs Jump Old GC actuals updfr_off $ + toCall e Nothing updfr_off 0 + +mkForeignJump :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset + -> CmmAGraph +mkForeignJump conv e actuals updfr_off = + mkForeignJumpExtra conv e actuals updfr_off noExtraStack + +mkForeignJumpExtra :: Convention -> CmmExpr -> [CmmActual] + -> UpdFrameOffset -> (ByteOff, [(CmmExpr, ByteOff)]) + -> CmmAGraph +mkForeignJumpExtra conv e actuals updfr_off extra_stack = + lastWithArgsAndExtraStack Jump Old conv actuals updfr_off extra_stack $ + toCall e Nothing updfr_off 0 + +mkCbranch :: CmmExpr -> BlockId -> BlockId -> CmmAGraph +mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot) + +mkSwitch :: CmmExpr -> [Maybe BlockId] -> CmmAGraph +mkSwitch e tbl = mkLast $ CmmSwitch e tbl + mkReturn :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph +mkReturn e actuals updfr_off = + lastWithArgs Ret Old NativeReturn actuals updfr_off $ + toCall e Nothing updfr_off 0 + mkReturnSimple :: [CmmActual] -> UpdFrameOffset -> CmmAGraph +mkReturnSimple actuals updfr_off = + mkReturn e actuals updfr_off + where e = CmmLoad (CmmStackSlot Old updfr_off) gcWord mkBranch :: BlockId -> CmmAGraph -mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> CmmAGraph -mkCmmIfThen :: CmmExpr -> CmmAGraph -> CmmAGraph -mkCmmWhileDo :: CmmExpr -> CmmAGraph -> CmmAGraph - -outOfLine :: CmmAGraph -> CmmAGraph --- ^ The argument is an CmmAGraph that must have an --- empty entry sequence and be closed at the 
end. --- The result is a new CmmAGraph that is open at the --- end and goes directly from entry to exit, with the --- original graph sitting to the side out-of-line. --- --- Example: mkMiddle (x = 3) --- <*> outOfLine (mkLabel L <*> ...stuff...) --- <*> mkMiddle (y = x) --- Control will flow directly from x=3 to y=x; --- the block starting with L is "on the side". --- --- N.B. algebraically forall g g' : g <*> outOfLine g' == outOfLine g' <*> g +mkBranch bid = mkLast (CmmBranch bid) + +mkFinalCall :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset + -> CmmAGraph +mkFinalCall f _ actuals updfr_off = + lastWithArgs Call Old NativeDirectCall actuals updfr_off $ + toCall f Nothing updfr_off 0 + +mkCallReturnsTo :: CmmExpr -> Convention -> [CmmActual] + -> BlockId + -> ByteOff + -> UpdFrameOffset + -> (ByteOff, [(CmmExpr,ByteOff)]) + -> CmmAGraph +mkCallReturnsTo f callConv actuals ret_lbl ret_off updfr_off extra_stack = do + lastWithArgsAndExtraStack Call (Young ret_lbl) callConv actuals + updfr_off extra_stack $ + toCall f (Just ret_lbl) updfr_off ret_off + +mkUnsafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph +mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as + -------------------------------------------------------------------------- --- ================ IMPLEMENTATION ================-- - --------------------------------------------------- --- Raw CmmAGraph handling - -emptyAGraph = return $ Opened emptyGraph -ag <*> ah = do g <- ag - h <- ah - return (case (g, h) of - (Opened g, Opened h) -> Opened $ g H.<*> h - (Opened g, Closed h) -> Closed $ g H.<*> h - (Closed g, Opened GNil) -> Closed g - (Closed g, Opened (GUnit e)) -> note_unreachable e $ Closed g - (Closed g, Opened (GMany (JustO e) b x)) -> note_unreachable e $ Opened $ g H.|*><*| GMany NothingO b x - (Closed g, Closed (GMany (JustO e) b x)) -> note_unreachable e $ Closed $ g H.|*><*| GMany NothingO b x - :: CmmGraphOC) -catAGraphs = foldl (<*>) emptyAGraph - 
-outOfLine ag = withFreshLabel "outOfLine" $ \l -> - do g <- ag - return (case g of - Closed (GMany (JustO e) b _) -> note_unreachable e $ Opened $ - GMany (JustO $ BLast $ CmmBranch l) b (JustO $ BFirst $ CmmEntry l) - _ -> panic "outOfLine" - :: CmmGraphOC) - -note_unreachable :: Block CmmNode O x -> a -> a -note_unreachable block graph = - ASSERT (block_is_empty_or_label) -- Note [Branch follows branch] - graph - where block_is_empty_or_label :: Bool - block_is_empty_or_label = case blockToNodeList block of - (NothingC, [], NothingC) -> True - (NothingC, [], JustC (CmmBranch _)) -> True - _ -> False - -mkLabel bid = return $ Opened $ H.mkLast (CmmBranch bid) |*><*| H.mkFirst (CmmEntry bid) -mkMiddle middle = return $ Opened $ H.mkMiddle middle -mkLast last = return $ Closed $ H.mkLast last - -withUnique f = getUniqueM >>= f -withFreshLabel _name f = getUniqueM >>= f . uniqueToLbl . intToUnique . getKey -lgraphOfAGraph g = do u <- getUniqueM - labelAGraph (mkBlockId u) g - -labelAGraph lbl ag = do g <- ag - return $ CmmGraph {g_entry=lbl, g_graph=H.mkFirst (CmmEntry lbl) H.<*> closed g} - where closed :: CmmGraphOC -> Graph CmmNode O C - closed (Closed g) = g - closed (Opened g@(GMany entry body (JustO exit))) = - ASSERT (entryLabel exit `notElem` map entryLabel (postorder_dfs g)) - GMany entry body NothingO - closed (Opened _) = panic "labelAGraph" - --------------------------------------------------- --- CmmAGraph constructions - -mkNop = emptyAGraph -mkComment fs = mkMiddle $ CmmComment fs -mkStore l r = mkMiddle $ CmmStore l r - --- NEED A COMPILER-DEBUGGING FLAG HERE --- Sanity check: any value assigned to a pointer must be non-zero. --- If it's 0, cause a crash immediately. -mkAssign l r = if opt_StubDeadValues then assign l r <*> check l else assign l r - where assign l r = mkMiddle (CmmAssign l r) - check (CmmGlobal _) = mkNop - check l@(CmmLocal reg) = -- if a ptr arg is NULL, cause a crash! 
- if isGcPtrType ty then - mkCmmIfThen (CmmMachOp (MO_Eq w) [r, stackStubExpr w]) - (assign l (CmmLoad (CmmLit (CmmInt 0 w)) ty)) - else mkNop - where ty = localRegType reg - w = typeWidth ty - r = CmmReg l -- Why are we inserting extra blocks that simply branch to the successors? -- Because in addition to the branch instruction, @mkBranch@ will insert -- a necessary adjustment to the stack pointer. -mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot) -mkSwitch e tbl = mkLast $ CmmSwitch e tbl - -mkSafeCall t fs as upd i = withFreshLabel "safe call" $ body - where - body k = - ( mkStore (CmmStackSlot (CallArea (Young k)) (widthInBytes wordWidth)) - (CmmLit (CmmBlock k)) - <*> mkLast (CmmForeignCall {tgt=t, res=fs, args=as, succ=k, updfr=upd, intrbl=i}) - <*> mkLabel k) -mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as -mkBranch bid = mkLast (CmmBranch bid) - -mkCmmIfThenElse e tbranch fbranch = - withFreshLabel "end of if" $ \endif -> - withFreshLabel "start of then" $ \tid -> - withFreshLabel "start of else" $ \fid -> - mkCbranch e tid fid <*> - mkLabel tid <*> tbranch <*> mkBranch endif <*> - mkLabel fid <*> fbranch <*> mkLabel endif - -mkCmmIfThen e tbranch - = withFreshLabel "end of if" $ \endif -> - withFreshLabel "start of then" $ \tid -> - mkCbranch e tid endif <*> - mkLabel tid <*> tbranch <*> mkLabel endif - -mkCmmWhileDo e body = - withFreshLabel "loop test" $ \test -> - withFreshLabel "loop head" $ \head -> - withFreshLabel "end while" $ \endwhile -> - -- Forrest Baskett's while-loop layout - mkBranch test <*> mkLabel head <*> body - <*> mkLabel test <*> mkCbranch e head endwhile - <*> mkLabel endwhile -- For debugging purposes, we can stub out dead stack slots: stackStubExpr :: Width -> CmmExpr @@ -286,12 +258,9 @@ stackStubExpr w = CmmLit (CmmInt 0 w) -- Therefore, for copying arguments and results, we provide different -- functions to pass the arguments in an overflow area and to pass them in spill slots. 
copyInOflow :: Convention -> Area -> [CmmFormal] -> (Int, CmmAGraph) -copyInSlot :: Convention -> [CmmFormal] -> [CmmNode O O] -copyOutSlot :: Convention -> [LocalReg] -> [CmmNode O O] copyInOflow conv area formals = (offset, catAGraphs $ map mkMiddle nodes) where (offset, nodes) = copyIn oneCopyOflowI conv area formals -copyInSlot c f = snd $ copyIn oneCopySlotI c (panic "no area for copying to slots") f type SlotCopier = Area -> (LocalReg, ByteOff) -> (ByteOff, [CmmNode O O]) -> (ByteOff, [CmmNode O O]) @@ -312,104 +281,86 @@ copyIn oflow conv area formals = adjust rst x@(_, RegisterParam _) = x : rst -- Copy-in one arg, using overflow space if needed. -oneCopyOflowI, oneCopySlotI :: SlotCopier +oneCopyOflowI :: SlotCopier oneCopyOflowI area (reg, off) (n, ms) = (max n off, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty) : ms) where ty = localRegType reg --- Copy-in one arg, using spill slots if needed -- used for calling conventions at --- a procpoint that is not a return point. The offset is irrelevant here... -oneCopySlotI _ (reg, _) (n, ms) = - (n, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg) w) ty) : ms) - where ty = localRegType reg - w = widthInBytes (typeWidth ty) - - -- Factoring out the common parts of the copyout functions yielded something -- more complicated: -copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual] -> UpdFrameOffset -> - (Int, CmmAGraph) +data Transfer = Call | Jump | Ret deriving Eq + +copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual] + -> UpdFrameOffset + -> (ByteOff, [(CmmExpr,ByteOff)]) -- extra stack stuff + -> (Int, CmmAGraph) + -- Generate code to move the actual parameters into the locations --- required by the calling convention. This includes a store for the return address. +-- required by the calling convention. This includes a store for the +-- return address. -- --- The argument layout function ignores the pointer to the info table, so we slot that --- in here. 
When copying-out to a young area, we set the info table for return --- and adjust the offsets of the other parameters. --- If this is a call instruction, we adjust the offsets of the other parameters. -copyOutOflow conv transfer area@(CallArea a) actuals updfr_off - = foldr co (init_offset, emptyAGraph) args' +-- The argument layout function ignores the pointer to the info table, +-- so we slot that in here. When copying-out to a young area, we set +-- the info table for return and adjust the offsets of the other +-- parameters. If this is a call instruction, we adjust the offsets +-- of the other parameters. +copyOutOflow conv transfer area actuals updfr_off + (extra_stack_off, extra_stack_stuff) + = foldr co (init_offset, mkNop) (args' ++ stack_params) where co (v, RegisterParam r) (n, ms) = (n, mkAssign (CmmGlobal r) v <*> ms) co (v, StackParam off) (n, ms) = (max n off, mkStore (CmmStackSlot area off) v <*> ms) + stack_params = [ (e, StackParam (off + init_offset)) + | (e,off) <- extra_stack_stuff ] + (setRA, init_offset) = - case a of Young id -> id `seq` -- Generate a store instruction for - -- the return address if making a call + case area of + Young id -> id `seq` -- Generate a store instruction for + -- the return address if making a call if transfer == Call then ([(CmmLit (CmmBlock id), StackParam init_offset)], widthInBytes wordWidth) else ([], 0) - Old -> ([], updfr_off) + Old -> ([], updfr_off) + + arg_offset = init_offset + extra_stack_off args :: [(CmmExpr, ParamLocation)] -- The argument and where to put it args = assignArgumentsPos conv cmmExprType actuals args' = foldl adjust setRA args - where adjust rst (v, StackParam off) = (v, StackParam (off + init_offset)) : rst + where adjust rst (v, StackParam off) = (v, StackParam (off + arg_offset)) : rst adjust rst x@(_, RegisterParam _) = x : rst -copyOutOflow _ _ (RegSlot _) _ _ = panic "cannot copy arguments into a register slot" --- Args passed only in registers and stack slots; no overflow space. 
--- No return address may apply! -copyOutSlot conv actuals = foldr co [] args - where co (v, RegisterParam r) ms = CmmAssign (CmmGlobal r) (toExp v) : ms - co (v, StackParam off) ms = CmmStore (CmmStackSlot (RegSlot v) off) (toExp v) : ms - toExp r = CmmReg (CmmLocal r) - args = assignArgumentsPos conv localRegType actuals mkCallEntry :: Convention -> [CmmFormal] -> (Int, CmmAGraph) -mkCallEntry conv formals = copyInOflow conv (CallArea Old) formals +mkCallEntry conv formals = copyInOflow conv Old formals -lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual] -> UpdFrameOffset -> - (ByteOff -> CmmAGraph) -> CmmAGraph +lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual] + -> UpdFrameOffset + -> (ByteOff -> CmmAGraph) + -> CmmAGraph lastWithArgs transfer area conv actuals updfr_off last = - let (outArgs, copies) = copyOutOflow conv transfer area actuals updfr_off in + lastWithArgsAndExtraStack transfer area conv actuals + updfr_off noExtraStack last + +lastWithArgsAndExtraStack :: Transfer -> Area -> Convention -> [CmmActual] + -> UpdFrameOffset -> (ByteOff, [(CmmExpr,ByteOff)]) + -> (ByteOff -> CmmAGraph) + -> CmmAGraph +lastWithArgsAndExtraStack transfer area conv actuals updfr_off + extra_stack last = + let (outArgs, copies) = copyOutOflow conv transfer area actuals + updfr_off extra_stack in copies <*> last outArgs --- The area created for the jump and return arguments is the same area as the --- procedure entry. 
-old :: Area -old = CallArea Old -toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff -> ByteOff -> CmmAGraph +noExtraStack :: (ByteOff, [(CmmExpr,ByteOff)]) +noExtraStack = (0,[]) + +toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff -> ByteOff + -> CmmAGraph toCall e cont updfr_off res_space arg_space = mkLast $ CmmCall e cont arg_space res_space updfr_off -mkJump e actuals updfr_off = - lastWithArgs Jump old NativeNodeCall actuals updfr_off $ toCall e Nothing updfr_off 0 -mkDirectJump e actuals updfr_off = - lastWithArgs Jump old NativeDirectCall actuals updfr_off $ toCall e Nothing updfr_off 0 -mkJumpGC e actuals updfr_off = - lastWithArgs Jump old GC actuals updfr_off $ toCall e Nothing updfr_off 0 -mkForeignJump conv e actuals updfr_off = - lastWithArgs Jump old conv actuals updfr_off $ toCall e Nothing updfr_off 0 -mkReturn e actuals updfr_off = - lastWithArgs Ret old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0 - -- where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord -mkReturnSimple actuals updfr_off = - lastWithArgs Ret old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0 - where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord - -mkFinalCall f _ actuals updfr_off = - lastWithArgs Call old NativeDirectCall actuals updfr_off $ toCall f Nothing updfr_off 0 - -mkCmmCall f results actuals = mkCall f (NativeDirectCall, NativeReturn) results actuals - --- I'm dropping the SRT, but that should be okay: we plan to reconstruct it later. 
-mkCall f (callConv, retConv) results actuals updfr_off = - withFreshLabel "call successor" $ \k -> - let area = CallArea $ Young k - (off, copyin) = copyInOflow retConv area results - copyout = lastWithArgs Call area callConv actuals updfr_off - (toCall f (Just k) updfr_off off) - in (copyout <*> mkLabel k <*> copyin) diff --git a/compiler/cmm/OldCmm.hs b/compiler/cmm/OldCmm.hs index fc4706c8c4..aa83afbf8d 100644 --- a/compiler/cmm/OldCmm.hs +++ b/compiler/cmm/OldCmm.hs @@ -9,9 +9,7 @@ module OldCmm ( CmmGroup, GenCmmGroup, RawCmmGroup, CmmDecl, RawCmmDecl, ListGraph(..), - - CmmInfo(..), CmmInfoTable(..), ClosureTypeInfo(..), UpdateFrame(..), - + CmmInfoTable(..), ClosureTypeInfo(..), CmmStatic(..), CmmStatics(..), CmmFormal, CmmActual, cmmMapGraph, cmmTopMapGraph, @@ -49,24 +47,6 @@ import ForeignCall -- with assembly-language labels. ----------------------------------------------------------------------------- --- Info Tables ------------------------------------------------------------------------------ - -data CmmInfo - = CmmInfo - (Maybe BlockId) -- GC target. Nothing <=> CPS won't do stack check - -- JD: NOT USED BY NEW CODE GEN - (Maybe UpdateFrame) -- Update frame - CmmInfoTable -- Info table - --- | A frame that is to be pushed before entry to the function. --- Used to handle 'update' frames. -data UpdateFrame - = UpdateFrame - CmmExpr -- Frame header. Behaves like the target of a 'jump'. - [CmmExpr] -- Frame remainder. Behaves like the arguments of a 'jump'. 
- ------------------------------------------------------------------------------ -- Cmm, CmmDecl, CmmBasicBlock ----------------------------------------------------------------------------- @@ -85,8 +65,8 @@ data UpdateFrame newtype ListGraph i = ListGraph [GenBasicBlock i] -- | Cmm with the info table as a data type -type CmmGroup = GenCmmGroup CmmStatics CmmInfo (ListGraph CmmStmt) -type CmmDecl = GenCmmDecl CmmStatics CmmInfo (ListGraph CmmStmt) +type CmmGroup = GenCmmGroup CmmStatics CmmInfoTable (ListGraph CmmStmt) +type CmmDecl = GenCmmDecl CmmStatics CmmInfoTable (ListGraph CmmStmt) -- | Cmm with the info tables converted to a list of 'CmmStatic' along with the info -- table label. If we are building without tables-next-to-code there will be no statics @@ -225,16 +205,9 @@ instance UserOfLocalRegs CmmCallTarget where foldRegsUsed f set (CmmCallee e _) = foldRegsUsed f set e foldRegsUsed f set (CmmPrim _ mStmts) = foldRegsUsed f set mStmts -instance UserOfSlots CmmCallTarget where - foldSlotsUsed f set (CmmCallee e _) = foldSlotsUsed f set e - foldSlotsUsed _ set (CmmPrim {}) = set - instance UserOfLocalRegs a => UserOfLocalRegs (CmmHinted a) where foldRegsUsed f set a = foldRegsUsed f set (hintlessCmm a) -instance UserOfSlots a => UserOfSlots (CmmHinted a) where - foldSlotsUsed f set a = foldSlotsUsed f set (hintlessCmm a) - instance DefinerOfLocalRegs a => DefinerOfLocalRegs (CmmHinted a) where foldRegsDefd f set a = foldRegsDefd f set (hintlessCmm a) diff --git a/compiler/cmm/OldCmmLint.hs b/compiler/cmm/OldCmmLint.hs new file mode 100644 index 0000000000..72e40ce4f8 --- /dev/null +++ b/compiler/cmm/OldCmmLint.hs @@ -0,0 +1,209 @@ +----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow 2004-2006 +-- +-- CmmLint: checking the correctness of Cmm statements and expressions +-- +----------------------------------------------------------------------------- + +{-# OPTIONS -fno-warn-tabs #-} +-- The 
above warning supression flag is a temporary kludge. +-- While working on this module you are encouraged to remove it and +-- detab the module (please do the detabbing in a separate patch). See +-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces +-- for details + +module OldCmmLint ( + cmmLint, cmmLintTop + ) where + +import BlockId +import OldCmm +import CLabel +import Outputable +import OldPprCmm() +import Constants +import FastString +import Platform + +import Data.Maybe + +-- ----------------------------------------------------------------------------- +-- Exported entry points: + +cmmLint :: (Outputable d, Outputable h) + => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc +cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops + +cmmLintTop :: (Outputable d, Outputable h) + => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc +cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top + +runCmmLint :: Outputable a + => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc +runCmmLint _ l p = + case unCL (l p) of + Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"), + nest 2 err, + ptext $ sLit ("Program was:"), + nest 2 (ppr p)]) + Right _ -> Nothing + +lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint () +lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks)) + = addLintInfo (text "in proc " <> pprCLabel platform lbl) $ + let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks + in mapM_ (lintCmmBlock platform labels) blocks + +lintCmmDecl _ (CmmData {}) + = return () + +lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint () +lintCmmBlock platform labels (BasicBlock id stmts) + = addLintInfo (text "in basic block " <> ppr id) $ + mapM_ (lintCmmStmt platform labels) stmts + +-- ----------------------------------------------------------------------------- +-- lintCmmExpr + +-- Checks whether a CmmExpr is 
"type-correct", and check for obvious-looking +-- byte/word mismatches. + +lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType +lintCmmExpr platform (CmmLoad expr rep) = do + _ <- lintCmmExpr platform expr + -- Disabled, if we have the inlining phase before the lint phase, + -- we can have funny offsets due to pointer tagging. -- EZY + -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $ + -- cmmCheckWordAddress expr + return rep +lintCmmExpr platform expr@(CmmMachOp op args) = do + tys <- mapM (lintCmmExpr platform) args + if map (typeWidth . cmmExprType) args == machOpArgReps op + then cmmCheckMachOp op args tys + else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op) +lintCmmExpr platform (CmmRegOff reg offset) + = lintCmmExpr platform (CmmMachOp (MO_Add rep) + [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)]) + where rep = typeWidth (cmmRegType reg) +lintCmmExpr _ expr = + return (cmmExprType expr) + +-- Check for some common byte/word mismatches (eg. Sp + 1) +cmmCheckMachOp :: MachOp -> [CmmExpr] -> [CmmType] -> CmmLint CmmType +cmmCheckMachOp op [lit@(CmmLit (CmmInt { })), reg@(CmmReg _)] tys + = cmmCheckMachOp op [reg, lit] tys +cmmCheckMachOp op _ tys + = return (machOpResultType op tys) + +isOffsetOp :: MachOp -> Bool +isOffsetOp (MO_Add _) = True +isOffsetOp (MO_Sub _) = True +isOffsetOp _ = False + +-- This expression should be an address from which a word can be loaded: +-- check for funny-looking sub-word offsets. 
+_cmmCheckWordAddress :: CmmExpr -> CmmLint () +_cmmCheckWordAddress e@(CmmMachOp op [arg, CmmLit (CmmInt i _)]) + | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0 + = cmmLintDubiousWordOffset e +_cmmCheckWordAddress e@(CmmMachOp op [CmmLit (CmmInt i _), arg]) + | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0 + = cmmLintDubiousWordOffset e +_cmmCheckWordAddress _ + = return () + +-- No warnings for unaligned arithmetic with the node register, +-- which is used to extract fields from tagged constructor closures. +notNodeReg :: CmmExpr -> Bool +notNodeReg (CmmReg reg) | reg == nodeReg = False +notNodeReg _ = True + +lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint () +lintCmmStmt platform labels = lint + where lint (CmmNop) = return () + lint (CmmComment {}) = return () + lint stmt@(CmmAssign reg expr) = do + erep <- lintCmmExpr platform expr + let reg_ty = cmmRegType reg + if (erep `cmmEqType_ignoring_ptrhood` reg_ty) + then return () + else cmmLintAssignErr stmt erep reg_ty + lint (CmmStore l r) = do + _ <- lintCmmExpr platform l + _ <- lintCmmExpr platform r + return () + lint (CmmCall target _res args _) = + do lintTarget platform labels target + mapM_ (lintCmmExpr platform . 
hintlessCmm) args + lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e + lint (CmmSwitch e branches) = do + mapM_ checkTarget $ catMaybes branches + erep <- lintCmmExpr platform e + if (erep `cmmEqType_ignoring_ptrhood` bWord) + then return () + else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <> + text " :: " <> ppr erep) + lint (CmmJump e _) = lintCmmExpr platform e >> return () + lint (CmmReturn) = return () + lint (CmmBranch id) = checkTarget id + checkTarget id = if setMember id labels then return () + else cmmLintErr (text "Branch to nonexistent id" <+> ppr id) + +lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint () +lintTarget platform _ (CmmCallee e _) = do _ <- lintCmmExpr platform e + return () +lintTarget _ _ (CmmPrim _ Nothing) = return () +lintTarget platform labels (CmmPrim _ (Just stmts)) + = mapM_ (lintCmmStmt platform labels) stmts + + +checkCond :: CmmExpr -> CmmLint () +checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return () +checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values +checkCond expr + = cmmLintErr (hang (text "expression is not a conditional:") 2 + (ppr expr)) + +-- ----------------------------------------------------------------------------- +-- CmmLint monad + +-- just a basic error monad: + +newtype CmmLint a = CmmLint { unCL :: Either SDoc a } + +instance Monad CmmLint where + CmmLint m >>= k = CmmLint $ case m of + Left e -> Left e + Right a -> unCL (k a) + return a = CmmLint (Right a) + +cmmLintErr :: SDoc -> CmmLint a +cmmLintErr msg = CmmLint (Left msg) + +addLintInfo :: SDoc -> CmmLint a -> CmmLint a +addLintInfo info thing = CmmLint $ + case unCL thing of + Left err -> Left (hang info 2 err) + Right a -> Right a + +cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a +cmmLintMachOpErr expr argsRep opExpectsRep + = cmmLintErr (text "in MachOp application: " $$ + nest 2 (ppr expr) $$ + (text "op is 
expecting: " <+> ppr opExpectsRep) $$ + (text "arguments provide: " <+> ppr argsRep)) + +cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a +cmmLintAssignErr stmt e_ty r_ty + = cmmLintErr (text "in assignment: " $$ + nest 2 (vcat [ppr stmt, + text "Reg ty:" <+> ppr r_ty, + text "Rhs ty:" <+> ppr e_ty])) + + + +cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a +cmmLintDubiousWordOffset expr + = cmmLintErr (text "offset is not a multiple of words: " $$ + nest 2 (ppr expr)) diff --git a/compiler/cmm/OldPprCmm.hs b/compiler/cmm/OldPprCmm.hs index d6a12221fb..9990fd26a4 100644 --- a/compiler/cmm/OldPprCmm.hs +++ b/compiler/cmm/OldPprCmm.hs @@ -63,10 +63,6 @@ instance Outputable instr => Outputable (GenBasicBlock instr) where instance Outputable CmmStmt where ppr s = pprStmt s -instance Outputable CmmInfo where - ppr i = pprInfo i - - -- -------------------------------------------------------------------------- instance Outputable CmmSafety where ppr CmmUnsafe = ptext (sLit "_unsafe_call_") @@ -74,21 +70,6 @@ instance Outputable CmmSafety where ppr (CmmSafe srt) = ppr srt -- -------------------------------------------------------------------------- --- Info tables. The current pretty printer needs refinement --- but will work for now. --- --- For ideas on how to refine it, they used to be printed in the --- style of C--'s 'stackdata' declaration, just inside the proc body, --- and were labelled with the procedure name ++ "_info". -pprInfo :: CmmInfo -> SDoc -pprInfo (CmmInfo _gc_target update_frame info_table) = - vcat [{-ptext (sLit "gc_target: ") <> - maybe (ptext (sLit "<none>")) ppr gc_target,-} - ptext (sLit "update_frame: ") <> - maybe (ptext (sLit "<none>")) pprUpdateFrame update_frame, - ppr info_table] - --- -------------------------------------------------------------------------- -- Basic blocks look like assembly blocks. -- lbl: stmt ; stmt ; .. 
pprBBlock :: Outputable stmt => GenBasicBlock stmt -> SDoc @@ -156,18 +137,6 @@ pprStmt stmt = case stmt of instance (Outputable a) => Outputable (CmmHinted a) where ppr (CmmHinted a k) = ppr (a, k) -pprUpdateFrame :: UpdateFrame -> SDoc -pprUpdateFrame (UpdateFrame expr args) = - hcat [ ptext (sLit "jump") - , space - , if isTrivialCmmExpr expr - then pprExpr expr - else case expr of - CmmLoad (CmmReg _) _ -> pprExpr expr - _ -> parens (pprExpr expr) - , space - , parens ( commafy $ map ppr args ) ] - -- -------------------------------------------------------------------------- -- goto local label. [1], section 6.6 -- diff --git a/compiler/cmm/OptimizationFuel.hs b/compiler/cmm/OptimizationFuel.hs deleted file mode 100644 index a85b11bcc6..0000000000 --- a/compiler/cmm/OptimizationFuel.hs +++ /dev/null @@ -1,142 +0,0 @@ -{-# LANGUAGE TypeFamilies #-} --- | Optimisation fuel is used to control the amount of work the optimiser does. --- --- Every optimisation step consumes a certain amount of fuel and stops when --- it runs out of fuel. This can be used e.g. to debug optimiser bugs: Run --- the optimiser with varying amount of fuel to find out the exact number of --- steps where a bug is introduced in the output. -module OptimizationFuel - ( OptimizationFuel, amountOfFuel, tankFilledTo, unlimitedFuel, anyFuelLeft, oneLessFuel - , OptFuelState, initOptFuelState - , FuelConsumer, FuelUsingMonad, FuelState - , fuelGet, fuelSet, lastFuelPass, setFuelPass - , fuelExhausted, fuelDec1, tryWithFuel - , runFuelIO, runInfiniteFuelIO, fuelConsumingPass - , FuelUniqSM - , liftUniq - ) -where - -import Data.IORef -import Control.Monad -import StaticFlags (opt_Fuel) -import UniqSupply -import Panic -import Util - -import Compiler.Hoopl -import Compiler.Hoopl.GHC (getFuel, setFuel) - -#include "HsVersions.h" - - --- We limit the number of transactions executed using a record of flags --- stored in an HscEnv. 
The flags store the name of the last optimization --- pass and the amount of optimization fuel remaining. -data OptFuelState = - OptFuelState { pass_ref :: IORef String - , fuel_ref :: IORef OptimizationFuel - } -initOptFuelState :: IO OptFuelState -initOptFuelState = - do pass_ref' <- newIORef "unoptimized program" - fuel_ref' <- newIORef (tankFilledTo opt_Fuel) - return OptFuelState {pass_ref = pass_ref', fuel_ref = fuel_ref'} - -type FuelConsumer a = OptimizationFuel -> (a, OptimizationFuel) - -tankFilledTo :: Int -> OptimizationFuel -amountOfFuel :: OptimizationFuel -> Int - -anyFuelLeft :: OptimizationFuel -> Bool -oneLessFuel :: OptimizationFuel -> OptimizationFuel -unlimitedFuel :: OptimizationFuel - -newtype OptimizationFuel = OptimizationFuel Int - deriving Show - -tankFilledTo = OptimizationFuel -amountOfFuel (OptimizationFuel f) = f - -anyFuelLeft (OptimizationFuel f) = f > 0 -oneLessFuel (OptimizationFuel f) = ASSERT (f > 0) (OptimizationFuel (f - 1)) -unlimitedFuel = OptimizationFuel infiniteFuel - -data FuelState = FuelState { fs_fuel :: OptimizationFuel, fs_lastpass :: String } -newtype FuelUniqSM a = FUSM { unFUSM :: FuelState -> UniqSM (a, FuelState) } - -fuelConsumingPass :: String -> FuelConsumer a -> FuelUniqSM a -fuelConsumingPass name f = do setFuelPass name - fuel <- fuelGet - let (a, fuel') = f fuel - fuelSet fuel' - return a - -runFuelIO :: OptFuelState -> FuelUniqSM a -> IO a -runFuelIO fs (FUSM f) = - do pass <- readIORef (pass_ref fs) - fuel <- readIORef (fuel_ref fs) - u <- mkSplitUniqSupply 'u' - let (a, FuelState fuel' pass') = initUs_ u $ f (FuelState fuel pass) - writeIORef (pass_ref fs) pass' - writeIORef (fuel_ref fs) fuel' - return a - --- ToDo: Do we need the pass_ref when we are doing infinite fueld --- transformations? 
-runInfiniteFuelIO :: OptFuelState -> FuelUniqSM a -> IO a -runInfiniteFuelIO fs (FUSM f) = - do pass <- readIORef (pass_ref fs) - u <- mkSplitUniqSupply 'u' - let (a, FuelState _ pass') = initUs_ u $ f (FuelState unlimitedFuel pass) - writeIORef (pass_ref fs) pass' - return a - -instance Monad FuelUniqSM where - FUSM f >>= k = FUSM (\s -> f s >>= \(a, s') -> unFUSM (k a) s') - return a = FUSM (\s -> return (a, s)) - -instance MonadUnique FuelUniqSM where - getUniqueSupplyM = liftUniq getUniqueSupplyM - getUniqueM = liftUniq getUniqueM - getUniquesM = liftUniq getUniquesM - -liftUniq :: UniqSM x -> FuelUniqSM x -liftUniq x = FUSM (\s -> x >>= (\u -> return (u, s))) - -class Monad m => FuelUsingMonad m where - fuelGet :: m OptimizationFuel - fuelSet :: OptimizationFuel -> m () - lastFuelPass :: m String - setFuelPass :: String -> m () - -fuelExhausted :: FuelUsingMonad m => m Bool -fuelExhausted = fuelGet >>= return . anyFuelLeft - -fuelDec1 :: FuelUsingMonad m => m () -fuelDec1 = fuelGet >>= fuelSet . oneLessFuel - -tryWithFuel :: FuelUsingMonad m => a -> m (Maybe a) -tryWithFuel r = do f <- fuelGet - if anyFuelLeft f then fuelSet (oneLessFuel f) >> return (Just r) - else return Nothing - -instance FuelUsingMonad FuelUniqSM where - fuelGet = extract fs_fuel - lastFuelPass = extract fs_lastpass - fuelSet fuel = FUSM (\s -> return ((), s { fs_fuel = fuel })) - setFuelPass pass = FUSM (\s -> return ((), s { fs_lastpass = pass })) - -extract :: (FuelState -> a) -> FuelUniqSM a -extract f = FUSM (\s -> return (f s, s)) - -instance FuelMonad FuelUniqSM where - getFuel = liftM amountOfFuel fuelGet - setFuel = fuelSet . 
tankFilledTo - --- Don't bother to checkpoint the unique supply; it doesn't matter -instance CheckpointMonad FuelUniqSM where - type Checkpoint FuelUniqSM = FuelState - checkpoint = FUSM $ \fuel -> return (fuel, fuel) - restart fuel = FUSM $ \_ -> return ((), fuel) - diff --git a/compiler/cmm/PprCmm.hs b/compiler/cmm/PprCmm.hs index 183708c08e..dee6ee881e 100644 --- a/compiler/cmm/PprCmm.hs +++ b/compiler/cmm/PprCmm.hs @@ -146,8 +146,6 @@ pprConvention Slow = text "<slow-convention>" pprConvention GC = text "<gc-convention>" pprConvention PrimOpCall = text "<primop-call-convention>" pprConvention PrimOpReturn = text "<primop-ret-convention>" -pprConvention (Foreign c) = ppr c -pprConvention (Private {}) = text "<private-convention>" pprForeignConvention :: ForeignConvention -> SDoc pprForeignConvention (ForeignConvention c as rs) = ppr c <> ppr as <> ppr rs diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs index 7503127555..119f2b7239 100644 --- a/compiler/cmm/PprCmmExpr.hs +++ b/compiler/cmm/PprCmmExpr.hs @@ -237,12 +237,8 @@ pprLocalReg (LocalReg uniq rep) -- Stack areas pprArea :: Area -> SDoc -pprArea (RegSlot r) = hcat [ text "slot<", ppr r, text ">" ] -pprArea (CallArea id) = pprAreaId id - -pprAreaId :: AreaId -> SDoc -pprAreaId Old = text "old" -pprAreaId (Young id) = hcat [ text "young<", ppr id, text ">" ] +pprArea Old = text "old" +pprArea (Young id) = hcat [ text "young<", ppr id, text ">" ] -- needs to be kept in syn with CmmExpr.hs.GlobalReg -- diff --git a/compiler/cmm/SMRep.lhs b/compiler/cmm/SMRep.lhs index ce30799bf6..8b3308ef97 100644 --- a/compiler/cmm/SMRep.lhs +++ b/compiler/cmm/SMRep.lhs @@ -21,6 +21,7 @@ module SMRep ( StgWord, StgHalfWord, hALF_WORD_SIZE, hALF_WORD_SIZE_IN_BITS, WordOff, ByteOff, + roundUpToWords, -- * Closure repesentation SMRep(..), -- CmmInfo sees the rep; no one else does @@ -57,6 +58,7 @@ import FastString import Data.Char( ord ) import Data.Word +import Data.Bits \end{code} @@ -69,6 +71,9 @@ import 
Data.Word \begin{code} type WordOff = Int -- Word offset, or word count type ByteOff = Int -- Byte offset, or byte count + +roundUpToWords :: ByteOff -> ByteOff +roundUpToWords n = (n + (wORD_SIZE - 1)) .&. (complement (wORD_SIZE - 1)) \end{code} StgWord is a type representing an StgWord on the target platform. @@ -93,6 +98,7 @@ hALF_WORD_SIZE_IN_BITS = 32 #endif \end{code} + %************************************************************************ %* * \subsubsection[SMRep-datatype]{@SMRep@---storage manager representation} diff --git a/compiler/cmm/cmm-notes b/compiler/cmm/cmm-notes index 81882c8c0e..0e6a2341f2 100644 --- a/compiler/cmm/cmm-notes +++ b/compiler/cmm/cmm-notes @@ -24,27 +24,10 @@ More notes (June 11) * Check in ClosureInfo:
-- NB: Results here should line up with the results of SMRep.rtsClosureType
-* Possible refactoring: Nuke AGraph in favour of
- mkIfThenElse :: Expr -> Graph -> Graph -> FCode Graph
- or even
- mkIfThenElse :: HasUniques m => Expr -> Graph -> Graph -> m Graph
- (Remmber that the .cmm file parser must use this function)
-
- or parameterise FCode over its envt; the CgState part seem useful for both
-
* "Remove redundant reloads" in CmmSpillReload should be redundant; since
insertLateReloads is now gone, every reload is reloading a live variable.
Test and nuke.
-* Stack layout is very like register assignment: find non-conflicting assigments.
- In particular we can use colouring or linear scan (etc).
-
- We'd fine-grain interference (on a word by word basis) to get maximum overlap.
- But that may make very big interference graphs. So linear scan might be
- more attactive.
-
- NB: linear scan does on-the-fly live range splitting.
-
* When stubbing dead slots be careful not to write into an area that
overlaps with an area that's in use. So stubbing needs to *follow*
stack layout.
@@ -81,10 +64,6 @@ Things to do: Old.Cmm. We should abstract it to work on both representations, it needs only to
convert a CmmInfoTable to [CmmStatic].
- - The MkGraph currenty uses a different semantics for <*> than Hoopl. Maybe
- we could convert codeGen/StgCmm* clients to the Hoopl's semantics?
- It's all deeply unsatisfactory.
-
- Improve performance of Hoopl.
A nofib comparison of -fasm vs -fnewcodegen nofib compilation parameters
@@ -101,18 +80,12 @@ Things to do: When compiling nofib, ghc-head + libraries compiled with -fnew-codegen
is 31.4% slower (http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.zipghcoldgen.txt).
- So we generate a bit better code, but it takes us longer!
-
EZY: Also importantly, Hoopl uses dramatically more memory than the
old code generator.
- Are all blockToNodeList and blockOfNodeList really needed? Maybe we could
splice blocks instead?
- In the CmmContFlowOpt.blockConcat, using Dataflow seems too clumsy. Still,
- a block catenation function would be probably nicer than blockToNodeList
- / blockOfNodeList combo.
-
- lowerSafeForeignCall seems too lowlevel. Just use Dataflow. After that
delete splitEntrySeq from HooplUtils.
@@ -129,10 +102,6 @@ Things to do: - NB that CmmProcPoint line 283 has a hack that works around a GADT-related
bug in 6.10.
- - SDM (2010-02-26) can we remove the Foreign constructor from Convention?
- Reason: we never generate code for a function with the Foreign
- calling convention, and the code for calling foreign calls is generated
-
- AsmCodeGen has a generic Cmm optimiser; move this into new pipeline
EZY (2011-04-16): The mini-inliner has been generalized and ported,
but the constant folding and other optimizations need to still be
@@ -161,16 +130,6 @@ Things to do: - Top-level SRT threading is a bit ugly
- - Add type/newtype for CmmModule = [CmmGroup] -- A module
- CmmGroup = [CmmTop] -- A .o file
- CmmTop = Proc | Data -- A procedure or data
-
- - This is a *change*: currently a CmmGroup is one function's-worth of code
- regardless of SplitObjs. Question: can we *always* generate M.o if there
- is just one element in the list (rather than M/M1.o, M/M2.o etc)
-
- One SRT per group.
-
- See "CAFs" below; we want to totally refactor the way SRTs are calculated
- Pull out Areas into its own module
diff --git a/compiler/codeGen/CgInfoTbls.hs b/compiler/codeGen/CgInfoTbls.hs index 6c77255a62..7cdb1b6f7e 100644 --- a/compiler/codeGen/CgInfoTbls.hs +++ b/compiler/codeGen/CgInfoTbls.hs @@ -36,7 +36,7 @@ import CgBindery import CgCallConv import CgUtils import CgMonad -import CmmBuildInfoTables +import CmmUtils import OldCmm import CLabel @@ -66,10 +66,9 @@ emitClosureCodeAndInfoTable cl_info args body -- Convert from 'ClosureInfo' to 'CmmInfo'. -- Not used for return points. (The 'smRepClosureTypeInt' call would panic.) -mkCmmInfo :: ClosureInfo -> FCode CmmInfo +mkCmmInfo :: ClosureInfo -> FCode CmmInfoTable mkCmmInfo cl_info - = return (CmmInfo gc_target Nothing $ - CmmInfoTable { cit_lbl = infoTableLabelFromCI cl_info, + = return (CmmInfoTable { cit_lbl = infoTableLabelFromCI cl_info, cit_rep = closureSMRep cl_info, cit_prof = prof, cit_srt = closureSRT cl_info }) @@ -79,14 +78,6 @@ mkCmmInfo cl_info ty_descr_w8 = stringToWord8s (closureTypeDescr cl_info) val_descr_w8 = stringToWord8s (closureValDescr cl_info) - -- The gc_target is to inform the CPS pass when it inserts a stack check. - -- Since that pass isn't used yet we'll punt for now. - -- When the CPS pass is fully integrated, this should - -- be replaced by the label that any heap check jumped to, - -- so that branch can be shared by both the heap (from codeGen) - -- and stack checks (from the CPS pass). 
- gc_target = panic "TODO: gc_target" - ------------------------------------------------------------------------- -- -- Generating the info table and code for a return point @@ -105,8 +96,7 @@ emitReturnTarget name stmts ; blks <- cgStmtsToBlocks stmts ; frame <- mkStackLayout ; let smrep = mkStackRep (mkLiveness frame) - info = CmmInfo gc_target Nothing info_tbl - info_tbl = CmmInfoTable { cit_lbl = info_lbl + info = CmmInfoTable { cit_lbl = info_lbl , cit_prof = NoProfilingInfo , cit_rep = smrep , cit_srt = srt_info } @@ -118,14 +108,6 @@ emitReturnTarget name stmts info_lbl = mkReturnInfoLabel uniq entry_lbl = mkReturnPtLabel uniq - -- The gc_target is to inform the CPS pass when it inserts a stack check. - -- Since that pass isn't used yet we'll punt for now. - -- When the CPS pass is fully integrated, this should - -- be replaced by the label that any heap check jumped to, - -- so that branch can be shared by both the heap (from codeGen) - -- and stack checks (from the CPS pass). - gc_target = panic "TODO: gc_target" - -- Build stack layout information from the state of the 'FCode' monad. -- Should go away once 'codeGen' starts using the CPS conversion -- pass to handle the stack. Until then, this is really just @@ -375,8 +357,8 @@ funInfoTable info_ptr emitInfoTableAndCode :: CLabel -- Label of entry or ret - -> CmmInfo -- ...the info table - -> [CmmFormal] -- ...args + -> CmmInfoTable -- ...the info table + -> [CmmFormal] -- ...args -> [CmmBasicBlock] -- ...and body -> Code diff --git a/compiler/codeGen/CgMonad.lhs b/compiler/codeGen/CgMonad.lhs index b96898f591..71da9e9ae0 100644 --- a/compiler/codeGen/CgMonad.lhs +++ b/compiler/codeGen/CgMonad.lhs @@ -13,8 +13,8 @@ stuff fits into the Big Picture. 
module CgMonad ( Code, FCode, - initC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs, - returnFC, fixC, fixC_, checkedAbsC, + initC, runC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs, + returnFC, fixC, fixC_, checkedAbsC, stmtC, stmtsC, labelC, emitStmts, nopC, whenC, newLabelC, newUnique, newUniqSupply, @@ -386,11 +386,12 @@ instance Monad FCode where {-# INLINE thenFC #-} {-# INLINE returnFC #-} -initC :: DynFlags -> Module -> FCode a -> IO a -initC dflags mod (FCode code) = do - uniqs <- mkSplitUniqSupply 'c' - case code (initCgInfoDown dflags mod) (initCgState uniqs) of - (res, _) -> return res +initC :: IO CgState +initC = do { uniqs <- mkSplitUniqSupply 'c' + ; return (initCgState uniqs) } + +runC :: DynFlags -> Module -> CgState -> FCode a -> (a,CgState) +runC dflags mod st (FCode code) = code (initCgInfoDown dflags mod) st returnFC :: a -> FCode a returnFC val = FCode $ \_ state -> (val, state) @@ -708,7 +709,7 @@ emitDecl decl = do state <- getState setState $ state { cgs_tops = cgs_tops state `snocOL` decl } -emitProc :: CmmInfo -> CLabel -> [CmmFormal] -> [CmmBasicBlock] -> Code +emitProc :: CmmInfoTable -> CLabel -> [CmmFormal] -> [CmmBasicBlock] -> Code emitProc info lbl [] blocks = do let proc_block = CmmProc info lbl (ListGraph blocks) state <- getState @@ -720,7 +721,7 @@ emitSimpleProc :: CLabel -> Code -> Code emitSimpleProc lbl code = do stmts <- getCgStmts code blks <- cgStmtsToBlocks stmts - emitProc (CmmInfo Nothing Nothing CmmNonInfoTable) lbl [] blks + emitProc CmmNonInfoTable lbl [] blks -- Get all the CmmTops (there should be no stmts) -- Return a single Cmm which may be split from other Cmms by diff --git a/compiler/codeGen/CodeGen.lhs b/compiler/codeGen/CodeGen.lhs index ce12d43bbf..c9b2bf8ab0 100644 --- a/compiler/codeGen/CodeGen.lhs +++ b/compiler/codeGen/CodeGen.lhs @@ -30,7 +30,7 @@ import CgHpc import CLabel import OldCmm -import OldPprCmm +import OldPprCmm () import StgSyn import PrelNames @@ -45,40 +45,52 @@ import TyCon import 
Module import ErrUtils import Panic +import Outputable import Util +import OrdList +import Stream (Stream, liftIO) +import qualified Stream + +import Data.IORef + codeGen :: DynFlags -> Module -- Module we are compiling -> [TyCon] -- Type constructors -> CollectedCCs -- (Local/global) cost-centres needing declaring/registering. -> [(StgBinding,[(Id,[Id])])] -- Bindings to convert, with SRTs -> HpcInfo -- Profiling info - -> IO [CmmGroup] + -> Stream IO CmmGroup () -- N.B. returning '[Cmm]' and not 'Cmm' here makes it -- possible for object splitting to split up the -- pieces later. -codeGen dflags this_mod data_tycons cost_centre_info stg_binds hpc_info = do - showPass dflags "CodeGen" - code_stuff <- - initC dflags this_mod $ do - cmm_binds <- mapM (getCmm . cgTopBinding dflags) stg_binds - cmm_tycons <- mapM cgTyCon data_tycons - cmm_init <- getCmm (mkModuleInit dflags cost_centre_info this_mod hpc_info) - return (cmm_init : cmm_binds ++ cmm_tycons) - -- Put datatype_stuff after code_stuff, because the - -- datatype closure table (for enumeration types) to - -- (say) PrelBase_True_closure, which is defined in - -- code_stuff - - -- Note [codegen-split-init] the cmm_init block must - -- come FIRST. This is because when -split-objs is on - -- we need to combine this block with its - -- initialisation routines; see Note - -- [pipeline-split-init]. - - dumpIfSet_dyn dflags Opt_D_dump_cmm "Cmm" (pprCmms code_stuff) - return code_stuff +codeGen dflags this_mod data_tycons cost_centre_info stg_binds hpc_info + + = do { liftIO $ showPass dflags "CodeGen" + + ; cgref <- liftIO $ newIORef =<< initC + ; let cg :: FCode CmmGroup -> Stream IO CmmGroup () + cg fcode = do + cmm <- liftIO $ do + st <- readIORef cgref + let (a,st') = runC dflags this_mod st fcode + + dumpIfSet_dyn dflags Opt_D_dump_cmm "Cmm" $ ppr a + + -- NB. stub-out cgs_tops and cgs_stmts. This fixes + -- a big space leak. DO NOT REMOVE! + writeIORef cgref $! 
st'{ cgs_tops = nilOL, + cgs_stmts = nilOL } + return a + Stream.yield cmm + + ; cg (getCmm $ mkModuleInit dflags cost_centre_info this_mod hpc_info) + + ; mapM_ (cg . getCmm . cgTopBinding dflags) stg_binds + + ; mapM_ (cg . cgTyCon) data_tycons + } mkModuleInit :: DynFlags diff --git a/compiler/codeGen/StgCmm.hs b/compiler/codeGen/StgCmm.hs index 17a7062559..696af8107e 100644 --- a/compiler/codeGen/StgCmm.hs +++ b/compiler/codeGen/StgCmm.hs @@ -46,6 +46,13 @@ import TyCon import Module import ErrUtils import Outputable +import Stream + +import OrdList +import MkGraph + +import Data.IORef +import Control.Monad (when) import Util codeGen :: DynFlags @@ -54,39 +61,51 @@ codeGen :: DynFlags -> CollectedCCs -- (Local/global) cost-centres needing declaring/registering. -> [(StgBinding,[(Id,[Id])])] -- Bindings to convert, with SRTs -> HpcInfo - -> IO [CmmGroup] -- Output + -> Stream IO CmmGroup () -- Output as a stream, so codegen can + -- be interleaved with output codeGen dflags this_mod data_tycons cost_centre_info stg_binds hpc_info - = do { showPass dflags "New CodeGen" - --- Why? --- ; mapM_ (\x -> seq x (return ())) data_tycons - - ; code_stuff <- initC dflags this_mod $ do - { cmm_binds <- mapM (getCmm . cgTopBinding dflags) stg_binds - ; cmm_tycons <- mapM cgTyCon data_tycons - ; cmm_init <- getCmm (mkModuleInit cost_centre_info - this_mod hpc_info) - ; return (cmm_init : cmm_binds ++ cmm_tycons) - } + = do { liftIO $ showPass dflags "New CodeGen" + + -- cg: run the code generator, and yield the resulting CmmGroup + -- Using an IORef to store the state is a bit crude, but otherwise + -- we would need to add a state monad layer. + ; cgref <- liftIO $ newIORef =<< initC + ; let cg :: FCode () -> Stream IO CmmGroup () + cg fcode = do + cmm <- liftIO $ do + st <- readIORef cgref + let (a,st') = runC dflags this_mod st (getCmm fcode) + + -- NB. stub-out cgs_tops and cgs_stmts. This fixes + -- a big space leak. DO NOT REMOVE! + writeIORef cgref $! 
st'{ cgs_tops = nilOL, + cgs_stmts = mkNop } + return a + yield cmm + + -- Note [codegen-split-init] the cmm_init block must come + -- FIRST. This is because when -split-objs is on we need to + -- combine this block with its initialisation routines; see + -- Note [pipeline-split-init]. + ; cg (mkModuleInit cost_centre_info this_mod hpc_info) + + ; mapM_ (cg . cgTopBinding dflags) stg_binds + -- Put datatype_stuff after code_stuff, because the -- datatype closure table (for enumeration types) to -- (say) PrelBase_True_closure, which is defined in -- code_stuff - - -- N.B. returning '[Cmm]' and not 'Cmm' here makes it - -- possible for object splitting to split up the - -- pieces later. - - -- Note [codegen-split-init] the cmm_init block must - -- come FIRST. This is because when -split-objs is on - -- we need to combine this block with its - -- initialisation routines; see Note - -- [pipeline-split-init]. - - ; return code_stuff } - + ; let do_tycon tycon = do + -- Generate a table of static closures for an + -- enumeration type Note that the closure pointers are + -- tagged. + when (isEnumerationTyCon tycon) $ cg (cgEnumerationTyCon tycon) + mapM_ (cg . cgDataCon) (tyConDataCons tycon) + + ; mapM_ do_tycon data_tycons + } --------------------------------------------------------------- -- Top-level bindings @@ -108,7 +127,7 @@ cgTopBinding dflags (StgNonRec id rhs, _srts) ; info <- cgTopRhs id' rhs ; addBindC (cg_id info) info -- Add the *un-externalised* Id to the envt, -- so we find it when we look up occurrences - } + } cgTopBinding dflags (StgRec pairs, _srts) = do { let (bndrs, rhss) = unzip pairs @@ -117,7 +136,7 @@ cgTopBinding dflags (StgRec pairs, _srts) ; fixC_(\ new_binds -> do { addBindsC new_binds ; mapFCs ( \ (b,e) -> cgTopRhs b e ) pairs' }) - ; return () } + ; return () } -- Urgh! 
I tried moving the forkStatics call from the rhss of cgTopRhs -- to enclose the listFCs in cgTopBinding, but that tickled the @@ -187,65 +206,19 @@ mkModuleInit cost_centre_info this_mod hpc_info ; emitDecl (CmmData Data (Statics (mkPlainModuleInitLabel this_mod) [])) } + --------------------------------------------------------------- -- Generating static stuff for algebraic data types --------------------------------------------------------------- -{- [These comments are rather out of date] - -Macro Kind of constructor -CONST_INFO_TABLE@ Zero arity (no info -- compiler uses static closure) -CHARLIKE_INFO_TABLE Charlike (no info -- compiler indexes fixed array) -INTLIKE_INFO_TABLE Intlike; the one macro generates both info tbls -SPEC_INFO_TABLE SPECish, and bigger than or equal to MIN_UPD_SIZE -GEN_INFO_TABLE GENish (hence bigger than or equal to MIN_UPD_SIZE@) -Possible info tables for constructor con: - -* _con_info: - Used for dynamically let(rec)-bound occurrences of - the constructor, and for updates. For constructors - which are int-like, char-like or nullary, when GC occurs, - the closure tries to get rid of itself. - -* _static_info: - Static occurrences of the constructor macro: STATIC_INFO_TABLE. - -For zero-arity constructors, \tr{con}, we NO LONGER generate a static closure; -it's place is taken by the top level defn of the constructor. - -For charlike and intlike closures there is a fixed array of static -closures predeclared. --} - -cgTyCon :: TyCon -> FCode CmmGroup -- All constructors merged together -cgTyCon tycon - = do { constrs <- mapM (getCmm . cgDataCon) (tyConDataCons tycon) - - -- Generate a table of static closures for an enumeration type - -- Put the table after the data constructor decls, because the - -- datatype closure table (for enumeration types) - -- to (say) PrelBase_$wTrue_closure, which is defined in code_stuff - -- Note that the closure pointers are tagged. - - -- N.B. 
comment says to put table after constructor decls, but - -- code puts it before --- NR 16 Aug 2007 - ; extra <- cgEnumerationTyCon tycon - - ; return (concat (extra ++ constrs)) - } - -cgEnumerationTyCon :: TyCon -> FCode [CmmGroup] +cgEnumerationTyCon :: TyCon -> FCode () cgEnumerationTyCon tycon - | isEnumerationTyCon tycon - = do { tbl <- getCmm $ - emitRODataLits (mkLocalClosureTableLabel (tyConName tycon) NoCafRefs) - [ CmmLabelOff (mkLocalClosureLabel (dataConName con) NoCafRefs) - (tagForCon con) - | con <- tyConDataCons tycon] - ; return [tbl] } - | otherwise - = return [] + = emitRODataLits (mkLocalClosureTableLabel (tyConName tycon) NoCafRefs) + [ CmmLabelOff (mkLocalClosureLabel (dataConName con) NoCafRefs) + (tagForCon con) + | con <- tyConDataCons tycon] + cgDataCon :: DataCon -> FCode () -- Generate the entry code, info tables, and (for niladic constructor) diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs index 3b166e3b6a..f98283f737 100644 --- a/compiler/codeGen/StgCmmBind.hs +++ b/compiler/codeGen/StgCmmBind.hs @@ -76,17 +76,17 @@ cgTopRhsClosure :: Id cgTopRhsClosure id ccs _ upd_flag srt args body = do { -- LAY OUT THE OBJECT let name = idName id - ; lf_info <- mkClosureLFInfo id TopLevel [] upd_flag args - ; srt_info <- getSRTInfo srt + ; lf_info <- mkClosureLFInfo id TopLevel [] upd_flag args + ; has_srt <- getSRTInfo srt ; mod_name <- getModuleName ; dflags <- getDynFlags ; let descr = closureDescription dflags mod_name name - closure_info = mkClosureInfo True id lf_info 0 0 srt_info descr - closure_label = mkLocalClosureLabel name (idCafInfo id) + closure_info = mkClosureInfo True id lf_info 0 0 descr + closure_label = mkLocalClosureLabel name (idCafInfo id) cg_id_info = litIdInfo id lf_info (CmmLabel closure_label) caffy = idCafInfo id info_tbl = mkCmmInfo closure_info -- XXX short-cut - closure_rep = mkStaticClosureFields info_tbl ccs caffy [] + closure_rep = mkStaticClosureFields info_tbl ccs caffy has_srt [] -- 
BUILD THE OBJECT, AND GENERATE INFO TABLE (IF NECESSARY) ; emitDataLits closure_label closure_rep @@ -110,7 +110,7 @@ cgBind (StgNonRec name rhs) ; emit (init <*> body) } cgBind (StgRec pairs) - = do { ((new_binds, inits), body) <- getCodeR $ fixC (\ new_binds_inits -> + = do { ((new_binds, inits), body) <- getCodeR $ fixC (\ new_binds_inits -> do { addBindsC $ fst new_binds_inits -- avoid premature deconstruction ; liftM unzip $ listFCs [ cgRhs b e | (b,e) <- pairs ] }) ; addBindsC new_binds @@ -162,8 +162,8 @@ cgRhs :: Id -> StgRhs -> FCode (CgIdInfo, CmmAGraph) cgRhs name (StgRhsCon cc con args) = buildDynCon name cc con args -cgRhs name (StgRhsClosure cc bi fvs upd_flag srt args body) - = mkRhsClosure name cc bi (nonVoidIds fvs) upd_flag srt args body +cgRhs name (StgRhsClosure cc bi fvs upd_flag _srt args body) + = mkRhsClosure name cc bi (nonVoidIds fvs) upd_flag args body ------------------------------------------------------------------------ -- Non-constructor right hand sides @@ -171,7 +171,7 @@ cgRhs name (StgRhsClosure cc bi fvs upd_flag srt args body) mkRhsClosure :: Id -> CostCentreStack -> StgBinderInfo -> [NonVoid Id] -- Free vars - -> UpdateFlag -> SRT + -> UpdateFlag -> [Id] -- Args -> StgExpr -> FCode (CgIdInfo, CmmAGraph) @@ -215,8 +215,7 @@ for semi-obvious reasons. mkRhsClosure bndr cc bi [NonVoid the_fv] -- Just one free var upd_flag -- Updatable thunk - _srt - [] -- A thunk + [] -- A thunk body@(StgCase (StgApp scrutinee [{-no args-}]) _ _ _ _ -- ignore uniq, etc. 
(AlgAlt _) @@ -247,8 +246,7 @@ mkRhsClosure bndr cc bi mkRhsClosure bndr cc bi fvs upd_flag - _srt - [] -- No args; a thunk + [] -- No args; a thunk body@(StgApp fun_id args) | args `lengthIs` (arity-1) @@ -269,7 +267,7 @@ mkRhsClosure bndr cc bi arity = length fvs ---------- Default case ------------------ -mkRhsClosure bndr cc _ fvs upd_flag srt args body +mkRhsClosure bndr cc _ fvs upd_flag args body = do { -- LAY OUT THE OBJECT -- If the binder is itself a free variable, then don't store -- it in the closure. Instead, just bind it to Node on entry. @@ -288,17 +286,16 @@ mkRhsClosure bndr cc _ fvs upd_flag srt args body -- MAKE CLOSURE INFO FOR THIS CLOSURE ; lf_info <- mkClosureLFInfo bndr NotTopLevel fvs upd_flag args ; mod_name <- getModuleName - ; c_srt <- getSRTInfo srt - ; dflags <- getDynFlags - ; let name = idName bndr - descr = closureDescription dflags mod_name name - fv_details :: [(NonVoid Id, VirtualHpOffset)] + ; dflags <- getDynFlags + ; let name = idName bndr + descr = closureDescription dflags mod_name name + fv_details :: [(NonVoid Id, VirtualHpOffset)] (tot_wds, ptr_wds, fv_details) = mkVirtHeapOffsets (isLFThunk lf_info) (addIdReps (map stripNV reduced_fvs)) closure_info = mkClosureInfo False -- Not static bndr lf_info tot_wds ptr_wds - c_srt descr + descr -- BUILD ITS INFO TABLE AND CODE ; forkClosureBody $ @@ -345,8 +342,7 @@ cgStdThunk bndr _cc _bndr_info _body lf_info payload descr = closureDescription dflags mod_name (idName bndr) closure_info = mkClosureInfo False -- Not static bndr lf_info tot_wds ptr_wds - NoC_SRT -- No SRT for a std-form closure - descr + descr -- ; (use_cc, blame_cc) <- chooseDynCostCentres cc [{- no args-}] body ; let use_cc = curCCS; blame_cc = curCCS @@ -546,10 +542,10 @@ emitBlackHoleCode is_single_entry = do whenC eager_blackholing $ do tickyBlackHole (not is_single_entry) - emit (mkStore (cmmOffsetW (CmmReg nodeReg) fixedHdrSize) - (CmmReg (CmmGlobal CurrentTSO))) + emitStore (cmmOffsetW (CmmReg nodeReg) 
fixedHdrSize) + (CmmReg (CmmGlobal CurrentTSO)) emitPrimCall [] MO_WriteBarrier [] - emit (mkStore (CmmReg nodeReg) (CmmReg (CmmGlobal EagerBlackholeInfo))) + emitStore (CmmReg nodeReg) (CmmReg (CmmGlobal EagerBlackholeInfo)) setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode () -- Nota Bene: this function does not change Node (even if it's a CAF), @@ -598,7 +594,7 @@ pushUpdateFrame es body offset <- foldM push updfr es withUpdFrameOff offset body where push off e = - do emit (mkStore (CmmStackSlot (CallArea Old) base) e) + do emitStore (CmmStackSlot Old base) e return base where base = off + widthInBytes (cmmExprWidth e) @@ -666,13 +662,14 @@ link_caf _is_upd = do -- node is live, so save it. -- see Note [atomic CAF entry] in rts/sm/Storage.c - ; emit $ mkCmmIfThen - (CmmMachOp mo_wordEq [ CmmReg (CmmLocal ret), CmmLit zeroCLit]) $ + ; updfr <- getUpdFrameOff + ; emit =<< mkCmmIfThen + (CmmMachOp mo_wordEq [ CmmReg (CmmLocal ret), CmmLit zeroCLit]) -- re-enter R1. Doing this directly is slightly dodgy; we're -- assuming lots of things, like the stack pointer hasn't -- moved since we entered the CAF. - let target = entryCode (closureInfoPtr (CmmReg nodeReg)) in - mkJump target [] 0 + (let target = entryCode (closureInfoPtr (CmmReg nodeReg)) in + mkJump target [] updfr) ; return hp_rel } diff --git a/compiler/codeGen/StgCmmClosure.hs b/compiler/codeGen/StgCmmClosure.hs index 483a67c1fa..8023abddec 100644 --- a/compiler/codeGen/StgCmmClosure.hs +++ b/compiler/codeGen/StgCmmClosure.hs @@ -657,7 +657,6 @@ data ClosureInfo -- the rest is just an unpacked CmmInfoTable. 
closureInfoLabel :: !CLabel, closureSMRep :: !SMRep, -- representation used by storage mgr - closureSRT :: !C_SRT, -- What SRT applies to this closure closureProf :: !ProfilingInfo } @@ -667,7 +666,7 @@ mkCmmInfo ClosureInfo {..} = CmmInfoTable { cit_lbl = closureInfoLabel , cit_rep = closureSMRep , cit_prof = closureProf - , cit_srt = closureSRT } + , cit_srt = NoC_SRT } -------------------------------------- @@ -678,16 +677,14 @@ mkClosureInfo :: Bool -- Is static -> Id -> LambdaFormInfo -> Int -> Int -- Total and pointer words - -> C_SRT - -> String -- String descriptor + -> String -- String descriptor -> ClosureInfo -mkClosureInfo is_static id lf_info tot_wds ptr_wds srt_info val_descr +mkClosureInfo is_static id lf_info tot_wds ptr_wds val_descr = ClosureInfo { closureName = name, closureLFInfo = lf_info, - closureInfoLabel = info_lbl, - closureSMRep = sm_rep, -- These four fields are a - closureSRT = srt_info, -- CmmInfoTable - closureProf = prof } -- --- + closureInfoLabel = info_lbl, -- These three fields are + closureSMRep = sm_rep, -- (almost) an info table + closureProf = prof } -- (we don't have an SRT yet) where name = idName id sm_rep = mkHeapRep is_static ptr_wds nonptr_wds (lfClosureType lf_info) @@ -920,15 +917,21 @@ cafBlackHoleInfoTable , cit_prof = NoProfilingInfo , cit_srt = NoC_SRT } -staticClosureNeedsLink :: CmmInfoTable -> Bool +staticClosureNeedsLink :: Bool -> CmmInfoTable -> Bool -- A static closure needs a link field to aid the GC when traversing -- the static closure graph. But it only needs such a field if either -- a) it has an SRT -- b) it's a constructor with one or more pointer fields -- In case (b), the constructor's fields themselves play the role -- of the SRT. 
-staticClosureNeedsLink info_tbl@CmmInfoTable{ cit_rep = smrep } +-- +-- At this point, the cit_srt field has not been calculated (that +-- happens right at the end of the Cmm pipeline), but we do have the +-- VarSet of CAFs that CoreToStg attached, and if that is empty there +-- will definitely not be an SRT. +-- +staticClosureNeedsLink has_srt CmmInfoTable{ cit_rep = smrep } | isConRep smrep = not (isStaticNoCafCon smrep) - | otherwise = needsSRT (cit_srt info_tbl) -staticClosureNeedsLink _ = False + | otherwise = has_srt -- needsSRT (cit_srt info_tbl) +staticClosureNeedsLink _ _ = False diff --git a/compiler/codeGen/StgCmmCon.hs b/compiler/codeGen/StgCmmCon.hs index a7af5662e9..c348570a54 100644 --- a/compiler/codeGen/StgCmmCon.hs +++ b/compiler/codeGen/StgCmmCon.hs @@ -92,6 +92,7 @@ cgTopRhsCon id con args info_tbl dontCareCCS -- Because it's static data caffy -- Has CAF refs + False -- no SRT payload -- BUILD THE OBJECT diff --git a/compiler/codeGen/StgCmmEnv.hs b/compiler/codeGen/StgCmmEnv.hs index f128e3ad60..2edd09da12 100644 --- a/compiler/codeGen/StgCmmEnv.hs +++ b/compiler/codeGen/StgCmmEnv.hs @@ -27,7 +27,7 @@ module StgCmmEnv ( bindArgsToRegs, bindToReg, rebindToReg, bindArgToReg, idToReg, - getArgAmode, getNonVoidArgAmodes, + getArgAmode, getNonVoidArgAmodes, getCgIdInfo, maybeLetNoEscape, ) where @@ -212,7 +212,6 @@ getNonVoidArgAmodes (arg:args) ; amodes <- getNonVoidArgAmodes args ; return ( amode : amodes ) } - ------------------------------------------------------------------------ -- Interface functions for binding and re-binding names ------------------------------------------------------------------------ diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs index 9faad02f46..68bfb6d9fe 100644 --- a/compiler/codeGen/StgCmmExpr.hs +++ b/compiler/codeGen/StgCmmExpr.hs @@ -77,7 +77,7 @@ cgExpr (StgLetNoEscape _ _ binds expr) = ; let join_id = mkBlockId (uniqFromSupply us) ; cgLneBinds join_id binds ; cgExpr expr - ; emit $ 
mkLabel join_id} + ; emitLabel join_id} cgExpr (StgCase expr _live_vars _save_vars bndr srt alt_type alts) = cgCase expr bndr srt alt_type alts @@ -130,7 +130,7 @@ cgLetNoEscapeRhs cgLetNoEscapeRhs join_id local_cc bndr rhs = do { (info, rhs_body) <- getCodeR $ cgLetNoEscapeRhsBody local_cc bndr rhs ; let (bid, _) = expectJust "cgLetNoEscapeRhs" $ maybeLetNoEscape info - ; emit (outOfLine $ mkLabel bid <*> rhs_body <*> mkBranch join_id) + ; emitOutOfLine bid $ rhs_body <*> mkBranch join_id ; return info } @@ -278,21 +278,69 @@ Hence: two basic plans for data GcPlan = GcInAlts -- Put a GC check at the start the case alternatives, [LocalReg] -- which binds these registers - SRT -- using this SRT - | NoGcInAlts -- The scrutinee is a primitive value, or a call to a + | NoGcInAlts -- The scrutinee is a primitive value, or a call to a -- primitive op which does no GC. Absorb the allocation -- of the case alternative(s) into the upstream check ------------------------------------- --- See Note [case on Bool] cgCase :: StgExpr -> Id -> SRT -> AltType -> [StgAlt] -> FCode () + +cgCase (StgOpApp (StgPrimOp op) args _) bndr _srt (AlgAlt tycon) alts + | isEnumerationTyCon tycon -- Note [case on bool] + = do { tag_expr <- do_enum_primop op args + + -- If the binder is not dead, convert the tag to a constructor + -- and assign it. + ; when (not (isDeadBinder bndr)) $ do + { tmp_reg <- bindArgToReg (NonVoid bndr) + ; emitAssign (CmmLocal tmp_reg) + (tagToClosure tycon tag_expr) } + + ; (mb_deflt, branches) <- cgAlgAltRhss NoGcInAlts Nothing + (NonVoid bndr) alts + ; emitSwitch tag_expr branches mb_deflt 0 (tyConFamilySize tycon - 1) + } + where + do_enum_primop :: PrimOp -> [StgArg] -> FCode CmmExpr + do_enum_primop TagToEnumOp [arg] -- No code! 
+ = getArgAmode (NonVoid arg) + do_enum_primop primop args + = do tmp <- newTemp bWord + cgPrimOp [tmp] primop args + return (CmmReg (CmmLocal tmp)) + {- -cgCase (OpApp ) bndr srt AlgAlt [(DataAlt flase, a2] - | isBoolTy (idType bndr) - , isDeadBndr bndr - = +Note [case on bool] + +This special case handles code like + + case a <# b of + True -> + False -> + +If we let the ordinary case code handle it, we'll get something like + + tmp1 = a < b + tmp2 = Bool_closure_tbl[tmp1] + if (tmp2 & 7 != 0) then ... // normal tagged case + +but this junk won't optimise away. What we really want is just an +inline comparison: + + if (a < b) then ... + +So we add a special case to generate + + tmp1 = a < b + if (tmp1 == 0) then ... + +and later optimisations will further improve this. + +We should really change all these primops to return Int# instead, that +would make this special case go away. -} + -- Note [ticket #3132]: we might be looking at a case of a lifted Id -- that was cast to an unlifted type. The Id will always be bottom, -- but we don't want the code generator to fall over here. If we @@ -319,7 +367,7 @@ cgCase (StgApp v []) bndr _ alt_type@(PrimAlt _) alts do { when (not reps_compatible) $ panic "cgCase: reps do not match, perhaps a dodgy unsafeCoerce?" 
; v_info <- getCgIdInfo v - ; emit (mkAssign (CmmLocal (idToReg (NonVoid bndr))) (idInfoToAmode v_info)) + ; emitAssign (CmmLocal (idToReg (NonVoid bndr))) (idInfoToAmode v_info) ; _ <- bindArgsToRegs [NonVoid bndr] ; cgAlts NoGcInAlts (NonVoid bndr) alt_type alts } where @@ -330,8 +378,11 @@ cgCase scrut@(StgApp v []) _ _ (PrimAlt _) _ do { mb_cc <- maybeSaveCostCentre True ; withSequel (AssignTo [idToReg (NonVoid v)] False) (cgExpr scrut) ; restoreCurrentCostCentre mb_cc - ; emit $ mkComment $ mkFastString "should be unreachable code" - ; emit $ withFreshLabel "l" (\l -> mkLabel l <*> mkBranch l)} + ; emitComment $ mkFastString "should be unreachable code" + ; l <- newLabelC + ; emitLabel l + ; emit (mkBranch l) + } {- case seq# a s of v @@ -349,7 +400,7 @@ cgCase (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _) bndr srt alt_type alts = -- handle seq#, same return convention as vanilla 'a'. cgCase (StgApp a []) bndr srt alt_type alts -cgCase scrut bndr srt alt_type alts +cgCase scrut bndr _srt alt_type alts = -- the general case do { up_hp_usg <- getVirtHp -- Upstream heap usage ; let ret_bndrs = chooseReturnBndrs bndr alt_type alts @@ -359,7 +410,7 @@ cgCase scrut bndr srt alt_type alts | isSingleton alts = False | up_hp_usg > 0 = False | otherwise = True - gc_plan = if gcInAlts then GcInAlts alt_regs srt else NoGcInAlts + gc_plan = if gcInAlts then GcInAlts alt_regs else NoGcInAlts ; mb_cc <- maybeSaveCostCentre simple_scrut ; withSequel (AssignTo alt_regs gcInAlts) (cgExpr scrut) @@ -417,14 +468,14 @@ chooseReturnBndrs _ _ _ = panic "chooseReturnBndrs" cgAlts :: GcPlan -> NonVoid Id -> AltType -> [StgAlt] -> FCode () -- At this point the result of the case are in the binders cgAlts gc_plan _bndr PolyAlt [(_, _, _, rhs)] - = maybeAltHeapCheck gc_plan (cgExpr rhs) + = maybeAltHeapCheck gc_plan Nothing (cgExpr rhs) cgAlts gc_plan _bndr (UbxTupAlt _) [(_, _, _, rhs)] - = maybeAltHeapCheck gc_plan (cgExpr rhs) + = maybeAltHeapCheck gc_plan Nothing (cgExpr rhs) -- 
Here bndrs are *already* in scope, so don't rebind them cgAlts gc_plan bndr (PrimAlt _) alts - = do { tagged_cmms <- cgAltRhss gc_plan bndr alts + = do { tagged_cmms <- cgAltRhss gc_plan Nothing bndr alts ; let bndr_reg = CmmLocal (idToReg bndr) (DEFAULT,deflt) = head tagged_cmms @@ -433,20 +484,17 @@ cgAlts gc_plan bndr (PrimAlt _) alts tagged_cmms' = [(lit,code) | (LitAlt lit, code) <- tagged_cmms] - ; emit (mkCmmLitSwitch (CmmReg bndr_reg) tagged_cmms' deflt) } + ; emitCmmLitSwitch (CmmReg bndr_reg) tagged_cmms' deflt } cgAlts gc_plan bndr (AlgAlt tycon) alts - = do { tagged_cmms <- cgAltRhss gc_plan bndr alts - + = do { retry_lbl <- newLabelC + ; emitLabel retry_lbl -- Note [alg-alt heap checks] + + ; (mb_deflt, branches) <- cgAlgAltRhss gc_plan (Just retry_lbl) + bndr alts + ; let fam_sz = tyConFamilySize tycon bndr_reg = CmmLocal (idToReg bndr) - mb_deflt = case tagged_cmms of - ((DEFAULT,rhs) : _) -> Just rhs - _other -> Nothing - -- DEFAULT is always first, if present - - branches = [ (dataConTagZ con, cmm) - | (DataAlt con, cmm) <- tagged_cmms ] -- Is the constructor tag in the node reg? ; if isSmallFamily fam_sz @@ -467,23 +515,68 @@ cgAlts gc_plan bndr (AlgAlt tycon) alts cgAlts _ _ _ _ = panic "cgAlts" -- UbxTupAlt and PolyAlt have only one alternative + +-- Note [alg-alt heap check] +-- +-- In an algebraic case with more than one alternative, we will have +-- code like +-- +-- L0: +-- x = R1 +-- goto L1 +-- L1: +-- if (x & 7 >= 2) then goto L2 else goto L3 +-- L2: +-- Hp = Hp + 16 +-- if (Hp > HpLim) then goto L4 +-- ... 
+-- L4: +-- call gc() returns to L5 +-- L5: +-- x = R1 +-- goto L1 + ------------------- -cgAltRhss :: GcPlan -> NonVoid Id -> [StgAlt] -> FCode [(AltCon, CmmAGraph)] -cgAltRhss gc_plan bndr alts +cgAlgAltRhss :: GcPlan -> Maybe BlockId -> NonVoid Id -> [StgAlt] + -> FCode ( Maybe CmmAGraph + , [(ConTagZ, CmmAGraph)] ) +cgAlgAltRhss gc_plan retry_lbl bndr alts + = do { tagged_cmms <- cgAltRhss gc_plan retry_lbl bndr alts + + ; let { mb_deflt = case tagged_cmms of + ((DEFAULT,rhs) : _) -> Just rhs + _other -> Nothing + -- DEFAULT is always first, if present + + ; branches = [ (dataConTagZ con, cmm) + | (DataAlt con, cmm) <- tagged_cmms ] + } + + ; return (mb_deflt, branches) + } + + +------------------- +cgAltRhss :: GcPlan -> Maybe BlockId -> NonVoid Id -> [StgAlt] + -> FCode [(AltCon, CmmAGraph)] +cgAltRhss gc_plan retry_lbl bndr alts = forkAlts (map cg_alt alts) where base_reg = idToReg bndr cg_alt :: StgAlt -> FCode (AltCon, CmmAGraph) cg_alt (con, bndrs, _uses, rhs) = getCodeR $ - maybeAltHeapCheck gc_plan $ + maybeAltHeapCheck gc_plan retry_lbl $ do { _ <- bindConArgs con base_reg bndrs ; cgExpr rhs ; return con } -maybeAltHeapCheck :: GcPlan -> FCode a -> FCode a -maybeAltHeapCheck NoGcInAlts code = code -maybeAltHeapCheck (GcInAlts regs _) code = altHeapCheck regs code +maybeAltHeapCheck :: GcPlan -> Maybe BlockId -> FCode a -> FCode a +maybeAltHeapCheck NoGcInAlts _ code = code +maybeAltHeapCheck (GcInAlts regs) mlbl code = + case mlbl of + Nothing -> altHeapCheck regs code + Just retry_lbl -> altHeapCheckReturnsTo regs retry_lbl code ----------------------------------------------------------------------------- -- Tail calls @@ -517,8 +610,8 @@ cgIdApp fun_id args cgLneJump :: BlockId -> [LocalReg] -> [StgArg] -> FCode () cgLneJump blk_id lne_regs args -- Join point; discard sequel = do { cmm_args <- getNonVoidArgAmodes args - ; emit (mkMultiAssign lne_regs cmm_args - <*> mkBranch blk_id) } + ; emitMultiAssign lne_regs cmm_args + ; emit (mkBranch blk_id) } 
cgTailCall :: Id -> CgIdInfo -> [StgArg] -> FCode () cgTailCall fun_id fun_info args = do @@ -529,27 +622,21 @@ cgTailCall fun_id fun_info args = do ReturnIt -> emitReturn [fun] -- ToDo: does ReturnIt guarantee tagged? EnterIt -> ASSERT( null args ) -- Discarding arguments - do { let fun' = CmmLoad fun (cmmExprType fun) - ; [ret,call] <- forkAlts [ - getCode $ emitReturn [fun], -- Is tagged; no need to untag - getCode $ do -- emit (mkAssign nodeReg fun) - emitCall (NativeNodeCall, NativeReturn) - (entryCode fun') [fun]] -- Not tagged - ; emit (mkCmmIfThenElse (cmmIsTagged fun) ret call) } - - SlowCall -> do -- A slow function call via the RTS apply routines + emitEnter fun + + SlowCall -> do -- A slow function call via the RTS apply routines { tickySlowCall lf_info args - ; emit $ mkComment $ mkFastString "slowCall" + ; emitComment $ mkFastString "slowCall" ; slowCall fun args } -- A direct function call (possibly with some left-over arguments) DirectEntry lbl arity -> do { tickyDirectCall arity args ; if node_points then - do emit $ mkComment $ mkFastString "directEntry" - emit (mkAssign nodeReg fun) + do emitComment $ mkFastString "directEntry" + emitAssign nodeReg fun directCall lbl arity args - else do emit $ mkComment $ mkFastString "directEntry else" + else do emitComment $ mkFastString "directEntry else" directCall lbl arity args } JumpToIt {} -> panic "cgTailCall" -- ??? @@ -561,33 +648,67 @@ cgTailCall fun_id fun_info args = do node_points = nodeMustPointToIt lf_info -{- Note [case on Bool] - ~~~~~~~~~~~~~~~~~~~ -A case on a Boolean value does two things: - 1. It looks up the Boolean in a closure table and assigns the - result to the binder. - 2. It branches to the True or False case through analysis - of the closure assigned to the binder. -But the indirection through the closure table is unnecessary -if the assignment to the binder will be dead code (use isDeadBndr). 
+emitEnter :: CmmExpr -> FCode () +emitEnter fun = do + { adjustHpBackwards + ; sequel <- getSequel + ; updfr_off <- getUpdFrameOff + ; case sequel of + -- For a return, we have the option of generating a tag-test or + -- not. If the value is tagged, we can return directly, which + -- is quicker than entering the value. This is a code + -- size/speed trade-off: when optimising for speed rather than + -- size we could generate the tag test. + -- + -- Right now, we do what the old codegen did, and omit the tag + -- test, just generating an enter. + Return _ -> do + { let entry = entryCode $ closureInfoPtr $ CmmReg nodeReg + ; emit $ mkForeignJump NativeNodeCall entry + [cmmUntag fun] updfr_off + } + + -- The result will be scrutinised in the sequel. This is where + -- we generate a tag-test to avoid entering the closure if + -- possible. + -- + -- The generated code will be something like this: + -- + -- R1 = fun -- copyout + -- if (fun & 7 != 0) goto Lcall else goto Lret + -- Lcall: + -- call [fun] returns to Lret + -- Lret: + -- fun' = R1 -- copyin + -- ... + -- + -- Note in particular that the label Lret is used as a + -- destination by both the tag-test and the call. This is + -- becase Lret will necessarily be a proc-point, and we want to + -- ensure that we generate only one proc-point for this + -- sequence. + -- + AssignTo res_regs _ -> do + { lret <- newLabelC + ; lcall <- newLabelC + ; let area = Young lret + ; let (off, copyin) = copyInOflow NativeReturn area res_regs + (outArgs, copyout) = copyOutOflow NativeNodeCall Call area + [fun] updfr_off (0,[]) + -- refer to fun via nodeReg after the copyout, to avoid having + -- both live simultaneously; this sometimes enables fun to be + -- inlined in the RHS of the R1 assignment. 
+ ; let entry = entryCode (closureInfoPtr (CmmReg nodeReg)) + the_call = toCall entry (Just lret) updfr_off off outArgs + ; emit $ + copyout <*> + mkCbranch (cmmIsTagged (CmmReg nodeReg)) lret lcall <*> + outOfLine lcall the_call <*> + mkLabel lret <*> + copyin + } + } -The following example illustrates how badly the code turns out: - STG: - case <=## [ww_s7Hx y_s7HD] of wild2_sbH8 { - GHC.Types.False -> <true code> // sbH8 dead - GHC.Types.True -> <false code> // sbH8 dead - }; - Cmm: - _s7HD::F64 = F64[_sbH7::I64 + 7]; // MidAssign - _ccsW::I64 = %MO_F_Le_W64(_s7Hx::F64, _s7HD::F64); // MidAssign - // emitReturn // MidComment - _sbH8::I64 = I64[ghczmprim_GHCziBool_Bool_closure_tbl + (_ccsW::I64 << 3)]; // MidAssign - _ccsX::I64 = _sbH8::I64 & 7; // MidAssign - if (_ccsX::I64 >= 2) goto ccsH; else goto ccsI; // LastCondBranch - -The assignments to _sbH8 and _ccsX are completely unnecessary. -Instead, we should branch based on the value of _ccsW. --} {- Note [Better Alt Heap Checks] If two function calls can share a return point, then they will also diff --git a/compiler/codeGen/StgCmmForeign.hs b/compiler/codeGen/StgCmmForeign.hs index 5bc0f7af4e..c67e0e0c95 100644 --- a/compiler/codeGen/StgCmmForeign.hs +++ b/compiler/codeGen/StgCmmForeign.hs @@ -22,6 +22,7 @@ import StgCmmEnv import StgCmmMonad import StgCmmUtils import StgCmmClosure +import StgCmmLayout import BlockId import Cmm @@ -45,15 +46,16 @@ import Control.Monad -- Code generation for Foreign Calls ----------------------------------------------------------------------------- -cgForeignCall :: [LocalReg] -- r1,r2 where to put the results - -> [ForeignHint] - -> ForeignCall -- the op +-- | emit code for a foreign call, and return the results to the sequel. 
+-- +cgForeignCall :: ForeignCall -- the op -> [StgArg] -- x,y arguments + -> Type -- result type -> FCode () --- Emits code for an unsafe foreign call: r1, r2 = foo( x, y, z ) -cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_args +cgForeignCall (CCall (CCallSpec target cconv safety)) stg_args res_ty = do { cmm_args <- getFCallArgs stg_args + ; (res_regs, res_hints) <- newUnboxedTupleRegs res_ty ; let ((call_args, arg_hints), cmm_target) = case target of StaticTarget _ _ False -> @@ -63,7 +65,7 @@ cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_a = case mPkgId of Nothing -> ForeignLabelInThisPackage Just pkgId -> ForeignLabelInPackage pkgId - size = call_size cmm_args + size = call_size cmm_args in ( unzip cmm_args , CmmLit (CmmLabel (mkForeignLabel lbl size labelSource IsFunction))) @@ -71,13 +73,31 @@ cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_a DynamicTarget -> case cmm_args of (fn,_):rest -> (unzip rest, fn) [] -> panic "cgForeignCall []" - fc = ForeignConvention cconv arg_hints result_hints + fc = ForeignConvention cconv arg_hints res_hints call_target = ForeignTarget cmm_target fc - ; srt <- getSRTInfo NoSRT -- SLPJ: Not sure what SRT - -- is right here - -- JD: Does it matter in the new codegen? - ; emitForeignCall safety results call_target call_args srt CmmMayReturn } + -- we want to emit code for the call, and then emitReturn. + -- However, if the sequel is AssignTo, we shortcut a little + -- and generate a foreign call that assigns the results + -- directly. Otherwise we end up generating a bunch of + -- useless "r = r" assignments, which are not merely annoying: + -- they prevent the common block elimination from working correctly + -- in the case of a safe foreign call. 
+ -- See Note [safe foreign call convention] + -- + ; sequel <- getSequel + ; case sequel of + AssignTo assign_to_these _ -> + do { emitForeignCall safety assign_to_these call_target + call_args CmmMayReturn + } + + _something_else -> + do { emitForeignCall safety res_regs call_target + call_args CmmMayReturn + ; emitReturn (map (CmmReg . CmmLocal) res_regs) + } + } where -- in the stdcall calling convention, the symbol needs @size appended -- to it, where size is the total number of bytes of arguments. We @@ -88,16 +108,83 @@ cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_a | otherwise = Nothing -- ToDo: this might not be correct for 64-bit API - arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType arg) wORD_SIZE + arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType arg) + wORD_SIZE + +{- Note [safe foreign call convention] + +The simple thing to do for a safe foreign call would be the same as an +unsafe one: just + + emitForeignCall ... + emitReturn ... + +but consider what happens in this case + + case foo x y z of + (# s, r #) -> ... + +The sequel is AssignTo [r]. The call to newUnboxedTupleRegs picks [r] +as the result reg, and we generate + + r = foo(x,y,z) returns to L1 -- emitForeignCall + L1: + r = r -- emitReturn + goto L2 +L2: + ... + +Now L1 is a proc point (by definition, it is the continuation of the +safe foreign call). If L2 does a heap check, then L2 will also be a +proc point. + +Furthermore, the stack layout algorithm has to arrange to save r +somewhere between the call and the jump to L1, which is annoying: we +would have to treat r differently from the other live variables, which +have to be saved *before* the call. + +So we adopt a special convention for safe foreign calls: the results +are copied out according to the NativeReturn convention by the call, +and the continuation of the call should copyIn the results. 
(The +copyOut code is actually inserted when the safe foreign call is +lowered later). The result regs attached to the safe foreign call are +only used temporarily to hold the results before they are copied out. + +We will now generate this: + + r = foo(x,y,z) returns to L1 + L1: + r = R1 -- copyIn, inserted by mkSafeCall + goto L2 + L2: + ... r ... + +And when the safe foreign call is lowered later (see Note [lower safe +foreign calls]) we get this: + + suspendThread() + r = foo(x,y,z) + resumeThread() + R1 = r -- copyOut, inserted by lowerSafeForeignCall + jump L1 + L1: + r = R1 -- copyIn, inserted by mkSafeCall + goto L2 + L2: + ... r ... + +Now consider what happens if L2 does a heap check: the Adams +optimisation kicks in and commons up L1 with the heap-check +continuation, resulting in just one proc point instead of two. Yay! +-} + emitCCall :: [(CmmFormal,ForeignHint)] -> CmmExpr -> [(CmmActual,ForeignHint)] -> FCode () emitCCall hinted_results fn hinted_args - = emitForeignCall PlayRisky results target args - NoC_SRT -- No SRT b/c we PlayRisky - CmmMayReturn + = emitForeignCall PlayRisky results target args CmmMayReturn where (args, arg_hints) = unzip hinted_args (results, result_hints) = unzip hinted_results @@ -107,7 +194,7 @@ emitCCall hinted_results fn hinted_args emitPrimCall :: [CmmFormal] -> CallishMachOp -> [CmmActual] -> FCode () emitPrimCall res op args - = emitForeignCall PlayRisky res (PrimTarget op) args NoC_SRT CmmMayReturn + = emitForeignCall PlayRisky res (PrimTarget op) args CmmMayReturn -- alternative entry point, used by CmmParse emitForeignCall @@ -115,11 +202,10 @@ emitForeignCall -> [CmmFormal] -- where to put the results -> ForeignTarget -- the op -> [CmmActual] -- arguments - -> C_SRT -- the SRT of the calls continuation -> CmmReturnInfo -- This can say "never returns" -- only RTS procedures do this -> FCode () -emitForeignCall safety results target args _srt _ret +emitForeignCall safety results target args _ret | not (playSafe 
safety) = do let (caller_save, caller_load) = callerSaveVolatileRegs emit caller_save @@ -129,7 +215,9 @@ emitForeignCall safety results target args _srt _ret | otherwise = do updfr_off <- getUpdFrameOff temp_target <- load_target_into_temp target - emit $ mkSafeCall temp_target results args updfr_off (playInterruptible safety) + emit =<< mkSafeCall temp_target results args updfr_off + (playInterruptible safety) + {- @@ -162,7 +250,7 @@ maybe_assign_temp e -- expressions, which are wrong here. -- this is a NonPtr because it only duplicates an existing reg <- newTemp (cmmExprType e) --TODO FIXME NOW - emit (mkAssign (CmmLocal reg) e) + emitAssign (CmmLocal reg) e return (CmmReg (CmmLocal reg)) -- ----------------------------------------------------------------------------- @@ -184,12 +272,12 @@ saveThreadState = emitSaveThreadState :: BlockId -> FCode () emitSaveThreadState bid = do -- CurrentTSO->stackobj->sp = Sp; - emit $ mkStore (cmmOffset (CmmLoad (cmmOffset stgCurrentTSO tso_stackobj) bWord) stack_SP) - (CmmStackSlot (CallArea (Young bid)) (widthInBytes (typeWidth gcWord))) + emitStore (cmmOffset (CmmLoad (cmmOffset stgCurrentTSO tso_stackobj) bWord) stack_SP) + (CmmStackSlot (Young bid) (widthInBytes (typeWidth gcWord))) emit closeNursery -- and save the current cost centre stack in the TSO when profiling: when opt_SccProfilingOn $ - emit (mkStore (cmmOffset stgCurrentTSO tso_CCCS) curCCS) + emitStore (cmmOffset stgCurrentTSO tso_CCCS) curCCS -- CurrentNursery->free = Hp+1; closeNursery :: CmmAGraph diff --git a/compiler/codeGen/StgCmmHeap.hs b/compiler/codeGen/StgCmmHeap.hs index 25161722f7..611304b5e0 100644 --- a/compiler/codeGen/StgCmmHeap.hs +++ b/compiler/codeGen/StgCmmHeap.hs @@ -10,7 +10,7 @@ module StgCmmHeap ( getVirtHp, setVirtHp, setRealHp, getHpRelOffset, hpRel, - entryHeapCheck, altHeapCheck, + entryHeapCheck, altHeapCheck, altHeapCheckReturnsTo, mkVirtHeapOffsets, mkVirtConstrOffsets, mkStaticClosureFields, mkStaticClosure, @@ -20,7 +20,6 @@ 
module StgCmmHeap ( #include "HsVersions.h" -import CmmType import StgSyn import CLabel import StgCmmLayout @@ -34,6 +33,7 @@ import StgCmmEnv import MkGraph +import Hoopl hiding ((<*>), mkBranch) import SMRep import Cmm import CmmUtils @@ -109,7 +109,7 @@ allocDynClosureCmm info_tbl lf_info use_cc _blame_cc amodes_w_offsets -- ALLOCATE THE OBJECT ; base <- getHpRelOffset info_offset - ; emit (mkComment $ mkFastString "allocDynClosure") + ; emitComment $ mkFastString "allocDynClosure" ; emitSetDynHdr base info_ptr use_cc ; let (cmm_args, offsets) = unzip amodes_w_offsets ; hpStore base cmm_args offsets @@ -151,9 +151,10 @@ mkStaticClosureFields :: CmmInfoTable -> CostCentreStack -> CafInfo + -> Bool -- SRT is non-empty? -> [CmmLit] -- Payload -> [CmmLit] -- The full closure -mkStaticClosureFields info_tbl ccs caf_refs payload +mkStaticClosureFields info_tbl ccs caf_refs has_srt payload = mkStaticClosure info_lbl ccs payload padding static_link_field saved_info_field where @@ -178,8 +179,10 @@ mkStaticClosureFields info_tbl ccs caf_refs payload | otherwise = ASSERT(null payload) [mkIntCLit 0] static_link_field - | is_caf || staticClosureNeedsLink info_tbl = [static_link_value] - | otherwise = [] + | is_caf || staticClosureNeedsLink has_srt info_tbl + = [static_link_value] + | otherwise + = [] saved_info_field | is_caf = [mkIntCLit 0] @@ -335,11 +338,12 @@ entryHeapCheck cl_info offset nodeSet arity args code args' = map (CmmReg . 
CmmLocal) args setN = case nodeSet of - Just n -> mkAssign nodeReg (CmmReg $ CmmLocal n) + Just _ -> mkNop -- No need to assign R1, it already + -- points to the closure Nothing -> mkAssign nodeReg $ CmmLit (CmmLabel $ staticClosureLabel cl_info) - {- Thunks: Set R1 = node, jump GCEnter1 + {- Thunks: jump GCEnter1 Function (fast): Set R1 = node, jump GCFun Function (slow): Set R1 = node, call generic_gc -} gc_call upd = setN <*> gc_lbl upd @@ -354,7 +358,10 @@ entryHeapCheck cl_info offset nodeSet arity args code - GC calls, but until then this fishy code works -} updfr_sz <- getUpdFrameOff - heapCheck True (gc_call updfr_sz) code + + loop_id <- newLabelC + emitLabel loop_id + heapCheck True (gc_call updfr_sz <*> mkBranch loop_id) code {- -- This code is slightly outdated now and we could easily keep the above @@ -400,21 +407,29 @@ entryHeapCheck cl_info offset nodeSet arity args code -} --------------------------------------------------------------- --- A heap/stack check at in a case alternative +-- ------------------------------------------------------------ +-- A heap/stack check in a case alternative altHeapCheck :: [LocalReg] -> FCode a -> FCode a altHeapCheck regs code + = do loop_id <- newLabelC + emitLabel loop_id + altHeapCheckReturnsTo regs loop_id code + +altHeapCheckReturnsTo :: [LocalReg] -> Label -> FCode a -> FCode a +altHeapCheckReturnsTo regs retry_lbl code = do updfr_sz <- getUpdFrameOff - heapCheck False (gc_call updfr_sz) code + gc_call_code <- gc_call updfr_sz + heapCheck False (gc_call_code <*> mkBranch retry_lbl) code where reg_exprs = map (CmmReg . 
CmmLocal) regs + -- Note [stg_gc arguments] gc_call sp = case rts_label regs of - Just gc -> mkCall (CmmLit gc) (GC, GC) regs reg_exprs sp - Nothing -> mkCall generic_gc (GC, GC) [] [] sp + Just gc -> mkCall (CmmLit gc) (GC, GC) regs reg_exprs sp (0,[]) + Nothing -> mkCall generic_gc (GC, GC) [] [] sp (0,[]) rts_label [reg] | isGcPtrType ty = Just (mkGcLabel "stg_gc_unpt_r1") @@ -432,6 +447,23 @@ altHeapCheck regs code rts_label _ = Nothing +-- Note [stg_gc arguments] +-- It might seem that we could avoid passing the arguments to the +-- stg_gc function, because they are already in the right registers. +-- While this is usually the case, it isn't always. Sometimes the +-- code generator has cleverly avoided the eval in a case, e.g. in +-- ffi/should_run/4221.hs we found +-- +-- case a_r1mb of z +-- FunPtr x y -> ... +-- +-- where a_r1mb is bound to a top-level constructor, and is known to be +-- evaluated. The codegen just assigns x, y and z, and continues; +-- R1 is never assigned. +-- +-- So we'll have to rely on optimisations to eliminate these +-- assignments where possible. + -- | The generic GC procedure; no params, no results generic_gc :: CmmExpr @@ -447,7 +479,7 @@ heapCheck checkStack do_gc code = getHeapUsage $ \ hpHw -> -- Emit heap checks, but be sure to do it lazily so -- that the conditionals on hpHw don't cause a black hole - do { emit $ do_checks checkStack hpHw do_gc + do { codeOnly $ do_checks checkStack hpHw do_gc ; tickyAllocHeap hpHw ; doGranAllocate hpHw ; setRealHp hpHw @@ -456,22 +488,25 @@ heapCheck checkStack do_gc code do_checks :: Bool -- Should we check the stack?
-> WordOff -- Heap headroom -> CmmAGraph -- What to do on failure - -> CmmAGraph -do_checks checkStack alloc do_gc - = withFreshLabel "gc" $ \ loop_id -> - withFreshLabel "gc" $ \ gc_id -> - mkLabel loop_id - <*> (let hpCheck = if alloc == 0 then mkNop - else mkAssign hpReg bump_hp <*> - mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id) - in if checkStack - then mkCmmIfThenElse sp_oflo (mkBranch gc_id) hpCheck - else hpCheck) - <*> mkComment (mkFastString "outOfLine should follow:") - <*> outOfLine (mkLabel gc_id - <*> mkComment (mkFastString "outOfLine here") - <*> do_gc - <*> mkBranch loop_id) + -> FCode () +do_checks checkStack alloc do_gc = do + gc_id <- newLabelC + hp_check <- if alloc == 0 + then return mkNop + else do + ifthen <- mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id) + return (mkAssign hpReg bump_hp <*> ifthen) + + if checkStack + then emit =<< mkCmmIfThenElse sp_oflo (mkBranch gc_id) hp_check + else emit hp_check + + emit $ mkComment (mkFastString "outOfLine should follow:") + + emitOutOfLine gc_id $ + mkComment (mkFastString "outOfLine here") <*> + do_gc -- this is expected to jump back somewhere + -- Test for stack pointer exhaustion, then -- bump heap pointer, and test for heap exhaustion -- Note that we don't move the heap pointer unless the diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs index 86986efdfa..9593af1f50 100644 --- a/compiler/codeGen/StgCmmLayout.hs +++ b/compiler/codeGen/StgCmmLayout.hs @@ -15,7 +15,7 @@ module StgCmmLayout ( mkArgDescr, - emitCall, emitReturn, + emitCall, emitReturn, adjustHpBackwards, emitClosureProcAndInfoTable, emitClosureAndInfoTable, @@ -41,10 +41,12 @@ import StgCmmEnv import StgCmmTicky import StgCmmMonad import StgCmmUtils +import StgCmmProf import MkGraph import SMRep import Cmm +import CmmUtils import CLabel import StgSyn import Id @@ -52,6 +54,7 @@ import Name import TyCon ( PrimRep(..) 
) import BasicTypes ( RepArity ) import StaticFlags +import Module import Constants import Util @@ -63,38 +66,60 @@ import FastString -- Call and return sequences ------------------------------------------------------------------------ -emitReturn :: [CmmExpr] -> FCode () --- Return multiple values to the sequel +-- | Return multiple values to the sequel +-- +-- If the sequel is @Return@ +-- +-- > return (x,y) +-- +-- If the sequel is @AssignTo [p,q]@ +-- +-- > p=x; q=y; -- --- If the sequel is Return --- return (x,y) --- If the sequel is AssignTo [p,q] --- p=x; q=y; +emitReturn :: [CmmExpr] -> FCode () emitReturn results = do { sequel <- getSequel; ; updfr_off <- getUpdFrameOff - ; emit $ mkComment $ mkFastString ("emitReturn: " ++ show sequel) + ; emitComment $ mkFastString ("emitReturn: " ++ show sequel) ; case sequel of Return _ -> do { adjustHpBackwards ; emit (mkReturnSimple results updfr_off) } AssignTo regs adjust -> do { if adjust then adjustHpBackwards else return () - ; emit (mkMultiAssign regs results) } + ; emitMultiAssign regs results } } + +-- | @emitCall conv fun args@ makes a call to the entry-code of @fun@, +-- using the call/return convention @conv@, passing @args@, and +-- returning the results to the current sequel. +-- emitCall :: (Convention, Convention) -> CmmExpr -> [CmmExpr] -> FCode () --- (cgCall fun args) makes a call to the entry-code of 'fun', --- passing 'args', and returning the results to the current sequel -emitCall convs@(callConv, _) fun args +emitCall convs fun args + = emitCallWithExtraStack convs fun args noExtraStack + + +-- | @emitCallWithExtraStack conv fun args stack@ makes a call to the +-- entry-code of @fun@, using the call/return convention @conv@, +-- passing @args@, pushing some extra stack frames described by +-- @stack@, and returning the results to the current sequel. 
+-- +emitCallWithExtraStack + :: (Convention, Convention) -> CmmExpr -> [CmmExpr] + -> (ByteOff, [(CmmExpr,ByteOff)]) -> FCode () +emitCallWithExtraStack convs@(callConv, _) fun args extra_stack = do { adjustHpBackwards ; sequel <- getSequel ; updfr_off <- getUpdFrameOff - ; emit $ mkComment $ mkFastString ("emitCall: " ++ show sequel) - ; case sequel of - Return _ -> emit (mkForeignJump callConv fun args updfr_off) - AssignTo res_regs _ -> emit (mkCall fun convs res_regs args updfr_off) - } + ; emitComment $ mkFastString ("emitCallWithExtraStack: " ++ show sequel) + ; case sequel of + Return _ -> + emit $ mkForeignJumpExtra callConv fun args updfr_off extra_stack + AssignTo res_regs _ -> do + emit =<< mkCall fun convs res_regs args updfr_off extra_stack + } + adjustHpBackwards :: FCode () -- This function adjusts and heap pointers just before a tail call or @@ -127,59 +152,137 @@ adjustHpBackwards -- Making calls: directCall and slowCall ------------------------------------------------------------------------- +-- General plan is: +-- - we'll make *one* fast call, either to the function itself +-- (directCall) or to stg_ap_<pat>_fast (slowCall) +-- Any left-over arguments will be pushed on the stack, +-- +-- e.g. 
Sp[old+8] = arg1 +-- Sp[old+16] = arg2 +-- Sp[old+32] = stg_ap_pp_info +-- R2 = arg3 +-- R3 = arg4 +-- call f() return to Nothing updfr_off: 32 + + directCall :: CLabel -> RepArity -> [StgArg] -> FCode () -- (directCall f n args) -- calls f(arg1, ..., argn), and applies the result to the remaining args -- The function f has arity n, and there are guaranteed at least n args -- Both arity and args include void args directCall lbl arity stg_args - = do { cmm_args <- getNonVoidArgAmodes stg_args - ; direct_call "directCall" lbl arity cmm_args (argsReps stg_args) } + = do { argreps <- getArgRepsAmodes stg_args + ; direct_call "directCall" lbl arity argreps } + slowCall :: CmmExpr -> [StgArg] -> FCode () -- (slowCall fun args) applies fun to args, returning the results to Sequel slowCall fun stg_args - = do { cmm_args <- getNonVoidArgAmodes stg_args - ; slow_call fun cmm_args (argsReps stg_args) } + = do { dflags <- getDynFlags + ; argsreps <- getArgRepsAmodes stg_args + ; let (rts_fun, arity) = slowCallPattern (map fst argsreps) + ; call <- getCode $ direct_call "slow_call" + (mkRtsApFastLabel rts_fun) arity argsreps + ; emitComment $ mkFastString ("slow_call for " ++ + showSDoc dflags (ppr fun) ++ + " with pat " ++ unpackFS rts_fun) + ; emit (mkAssign nodeReg fun <*> call) + } + -------------- -direct_call :: String -> CLabel -> RepArity -> [CmmExpr] -> [ArgRep] -> FCode () --- NB1: (length args) may be less than (length reps), because --- the args exclude the void ones --- NB2: 'arity' refers to the *reps* -direct_call caller lbl arity args reps - | debugIsOn && arity > length reps -- Too few args +direct_call :: String -> CLabel -> RepArity -> [(ArgRep,Maybe CmmExpr)] -> FCode () +direct_call caller lbl arity args + | debugIsOn && arity > length args -- Too few args = do -- Caller should ensure that there enough args! 
- pprPanic "direct_call" (text caller <+> ppr arity - <+> ppr lbl <+> ppr (length reps) - <+> ppr args <+> ppr reps ) - - | null rest_reps -- Precisely the right number of arguments - = emitCall (NativeDirectCall, NativeReturn) target args - - | otherwise -- Over-saturated call - = ASSERT( arity == length initial_reps ) - do { pap_id <- newTemp gcWord - ; withSequel (AssignTo [pap_id] True) - (emitCall (NativeDirectCall, NativeReturn) target fast_args) - ; slow_call (CmmReg (CmmLocal pap_id)) - rest_args rest_reps } + pprPanic "direct_call" $ + text caller <+> ppr arity <+> + ppr lbl <+> ppr (length args) <+> + ppr (map snd args) <+> ppr (map fst args) + + | null rest_args -- Precisely the right number of arguments + = emitCall (NativeDirectCall, NativeReturn) target (nonVArgs args) + + | otherwise -- Note [over-saturated calls] + = emitCallWithExtraStack (NativeDirectCall, NativeReturn) + target (nonVArgs fast_args) (mkStkOffsets stack_args) where target = CmmLit (CmmLabel lbl) - (initial_reps, rest_reps) = splitAt arity reps - arg_arity = count isNonV initial_reps - (fast_args, rest_args) = splitAt arg_arity args + (fast_args, rest_args) = splitAt arity args + stack_args = slowArgs rest_args --------------- -slow_call :: CmmExpr -> [CmmExpr] -> [ArgRep] -> FCode () -slow_call fun args reps - = do dflags <- getDynFlags - call <- getCode $ direct_call "slow_call" (mkRtsApFastLabel rts_fun) arity args reps - emit $ mkComment $ mkFastString ("slow_call for " ++ showSDoc dflags (ppr fun) ++ - " with pat " ++ unpackFS rts_fun) - emit (mkAssign nodeReg fun <*> call) + +-- When constructing calls, it is easier to keep the ArgReps and the +-- CmmExprs zipped together. However, a void argument has no +-- representation, so we need to use Maybe CmmExpr (the alternative of +-- using zeroCLit or even undefined would work, but would be ugly). 
+-- +getArgRepsAmodes :: [StgArg] -> FCode [(ArgRep, Maybe CmmExpr)] +getArgRepsAmodes = mapM getArgRepAmode + where getArgRepAmode arg + | V <- rep = return (V, Nothing) + | otherwise = do expr <- getArgAmode (NonVoid arg) + return (rep, Just expr) + where rep = toArgRep (argPrimRep arg) + +nonVArgs :: [(ArgRep, Maybe CmmExpr)] -> [CmmExpr] +nonVArgs [] = [] +nonVArgs ((_,Nothing) : args) = nonVArgs args +nonVArgs ((_,Just arg) : args) = arg : nonVArgs args + +{- +Note [over-saturated calls] + +The natural thing to do for an over-saturated call would be to call +the function with the correct number of arguments, and then apply the +remaining arguments to the value returned, e.g. + + f a b c d (where f has arity 2) + --> + r = call f(a,b) + call r(c,d) + +but this entails + - saving c and d on the stack + - making a continuation info table + - at the continuation, loading c and d off the stack into regs + - finally, call r + +Note that since there are a fixed number of different r's +(e.g. stg_ap_pp_fast), we can also pre-compile continuations +that correspond to each of them, rather than generating a fresh +one for each over-saturated call. + +Not only does this generate much less code, it is faster too. We will +generate something like: + +Sp[old+16] = c +Sp[old+24] = d +Sp[old+32] = stg_ap_pp_info +call f(a,b) -- usual calling convention + +For the purposes of the CmmCall node, we count this extra stack as +just more arguments that we are passing on the stack (cml_args). +-} + +-- | 'slowArgs' takes a list of function arguments and prepares them for +-- pushing on the stack for "extra" arguments to a function which requires +-- fewer arguments than we currently have. 
+slowArgs :: [(ArgRep, Maybe CmmExpr)] -> [(ArgRep, Maybe CmmExpr)] +slowArgs [] = [] +slowArgs args -- careful: reps contains voids (V), but args does not + | opt_SccProfilingOn = save_cccs ++ this_pat ++ slowArgs rest_args + | otherwise = this_pat ++ slowArgs rest_args where - (rts_fun, arity) = slowCallPattern reps + (arg_pat, n) = slowCallPattern (map fst args) + (call_args, rest_args) = splitAt n args + + stg_ap_pat = mkCmmRetInfoLabel rtsPackageId arg_pat + this_pat = (N, Just (mkLblExpr stg_ap_pat)) : call_args + save_cccs = [(N, Just (mkLblExpr save_cccs_lbl)), (N, Just curCCS)] + save_cccs_lbl = mkCmmRetInfoLabel rtsPackageId (fsLit "stg_restore_cccs") + + -- These cases were found to cover about 99% of all slow calls: slowCallPattern :: [ArgRep] -> (FastString, RepArity) @@ -202,6 +305,30 @@ slowCallPattern [] = (fsLit "stg_ap_0", 0) ------------------------------------------------------------------------- +-- Fix the byte-offsets of a bunch of things to push on the stack + +-- This is used for pushing slow-call continuations. +-- See Note [over-saturated calls]. + +mkStkOffsets + :: [(ArgRep, Maybe CmmExpr)] -- things to make offsets for + -> ( ByteOff -- OUTPUTS: Topmost allocated word + , [(CmmExpr, ByteOff)] ) -- things with offsets (voids filtered out) +mkStkOffsets things + = loop 0 [] (reverse things) + where + loop offset offs [] = (offset,offs) + loop offset offs ((_,Nothing):things) = loop offset offs things + -- ignore Void arguments + loop offset offs ((rep,Just thing):things) + = loop thing_off ((thing, thing_off):offs) things + where + thing_off = offset + argRepSizeW rep * wORD_SIZE + -- offset of thing is offset+size, because we're + -- growing the stack *downwards* as the offsets increase. 
+ + +------------------------------------------------------------------------- -- Classifying arguments: ArgRep ------------------------------------------------------------------------- @@ -237,10 +364,7 @@ isNonV :: ArgRep -> Bool isNonV V = False isNonV _ = True -argsReps :: [StgArg] -> [ArgRep] -argsReps = map (toArgRep . argPrimRep) - -argRepSizeW :: ArgRep -> WordOff -- Size in words +argRepSizeW :: ArgRep -> WordOff -- Size in words argRepSizeW N = 1 argRepSizeW P = 1 argRepSizeW F = 1 diff --git a/compiler/codeGen/StgCmmMonad.hs b/compiler/codeGen/StgCmmMonad.hs index 4eea38e22c..cc9919a4a0 100644 --- a/compiler/codeGen/StgCmmMonad.hs +++ b/compiler/codeGen/StgCmmMonad.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE GADTs #-} ----------------------------------------------------------------------------- -- -- Monad for Stg to C-- code generation @@ -16,16 +17,21 @@ module StgCmmMonad ( FCode, -- type - initC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs, + initC, runC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs, returnFC, fixC, fixC_, nopC, whenC, newUnique, newUniqSupply, + newLabelC, emitLabel, + emit, emitDecl, emitProc, emitProcWithConvention, emitSimpleProc, + emitOutOfLine, emitAssign, emitStore, emitComment, getCmm, cgStmtsToBlocks, getCodeR, getCode, getHeapUsage, - forkClosureBody, forkStatics, forkAlts, forkProc, codeOnly, + mkCmmIfThenElse, mkCmmIfThen, mkCall, mkCmmCall, mkSafeCall, + + forkClosureBody, forkStatics, forkAlts, forkProc, codeOnly, ConTagZ, @@ -69,12 +75,12 @@ import VarEnv import OrdList import Unique import UniqSupply -import FastString(sLit) +import FastString import Outputable import Control.Monad import Data.List -import Prelude hiding( sequence ) +import Prelude hiding( sequence, succ ) import qualified Prelude( sequence ) infixr 9 `thenC` -- Right-associative! 
@@ -95,12 +101,12 @@ instance Monad FCode where {-# INLINE thenFC #-} {-# INLINE returnFC #-} -initC :: DynFlags -> Module -> FCode a -> IO a -initC dflags mod (FCode code) - = do { uniqs <- mkSplitUniqSupply 'c' - ; case code (initCgInfoDown dflags mod) (initCgState uniqs) of - (res, _) -> return res - } +initC :: IO CgState +initC = do { uniqs <- mkSplitUniqSupply 'c' + ; return (initCgState uniqs) } + +runC :: DynFlags -> Module -> CgState -> FCode a -> (a,CgState) +runC dflags mod st (FCode code) = code (initCgInfoDown dflags mod) st returnFC :: a -> FCode a returnFC val = FCode (\_info_down state -> (val, state)) @@ -270,6 +276,8 @@ data HeapUsage = type VirtualHpOffset = WordOff + + initCgState :: UniqSupply -> CgState initCgState uniqs = MkCgState { cgs_stmts = mkNop, cgs_tops = nilOL, @@ -308,7 +316,6 @@ initHpUsage = HeapUsage { virtHp = 0, realHp = 0 } maxHpHw :: HeapUsage -> VirtualHpOffset -> HeapUsage hp_usg `maxHpHw` hw = hp_usg { virtHp = virtHp hp_usg `max` hw } - -------------------------------------------------------- -- Operators for getting and setting the state and "info_down". 
-------------------------------------------------------- @@ -591,6 +598,33 @@ getHeapUsage fcode -- ---------------------------------------------------------------------------- -- Combinators for emitting code +emitCgStmt :: CgStmt -> FCode () +emitCgStmt stmt + = do { state <- getState + ; setState $ state { cgs_stmts = cgs_stmts state `snocOL` stmt } + } + +emitLabel :: BlockId -> FCode () +emitLabel id = emitCgStmt (CgLabel id) + +emitComment :: FastString -> FCode () +#if 0 /* def DEBUG */ +emitComment s = emitCgStmt (CgStmt (CmmComment s)) +#else +emitComment _ = return () +#endif + +emitAssign :: CmmReg -> CmmExpr -> FCode () +emitAssign l r = emitCgStmt (CgStmt (CmmAssign l r)) + +emitStore :: CmmExpr -> CmmExpr -> FCode () +emitStore l r = emitCgStmt (CgStmt (CmmStore l r)) + + +newLabelC :: FCode BlockId +newLabelC = do { u <- newUnique + ; return $ mkBlockId u } + emit :: CmmAGraph -> FCode () emit ag = do { state <- getState @@ -601,6 +635,9 @@ emitDecl decl = do { state <- getState ; setState $ state { cgs_tops = cgs_tops state `snocOL` decl } } +emitOutOfLine :: BlockId -> CmmAGraph -> FCode () +emitOutOfLine l stmts = emitCgStmt (CgFork l stmts) + emitProcWithConvention :: Convention -> CmmInfoTable -> CLabel -> [CmmFormal] -> CmmAGraph -> FCode () emitProcWithConvention conv info lbl args blocks @@ -629,6 +666,55 @@ getCmm code ; setState $ state2 { cgs_tops = cgs_tops state1 } ; return (fromOL (cgs_tops state2)) } + +mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> FCode CmmAGraph +mkCmmIfThenElse e tbranch fbranch = do + endif <- newLabelC + tid <- newLabelC + fid <- newLabelC + return $ mkCbranch e tid fid <*> + mkLabel tid <*> tbranch <*> mkBranch endif <*> + mkLabel fid <*> fbranch <*> mkLabel endif + +mkCmmIfThen :: CmmExpr -> CmmAGraph -> FCode CmmAGraph +mkCmmIfThen e tbranch = do + endif <- newLabelC + tid <- newLabelC + return $ mkCbranch e tid endif <*> + mkLabel tid <*> tbranch <*> mkLabel endif + + +mkCall :: CmmExpr -> 
(Convention, Convention) -> [CmmFormal] -> [CmmActual] + -> UpdFrameOffset -> (ByteOff,[(CmmExpr,ByteOff)]) -> FCode CmmAGraph +mkCall f (callConv, retConv) results actuals updfr_off extra_stack = do + k <- newLabelC + let area = Young k + (off, copyin) = copyInOflow retConv area results + copyout = mkCallReturnsTo f callConv actuals k off updfr_off extra_stack + return (copyout <*> mkLabel k <*> copyin) + +mkCmmCall :: CmmExpr -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset + -> FCode CmmAGraph +mkCmmCall f results actuals updfr_off + = mkCall f (NativeDirectCall, NativeReturn) results actuals updfr_off (0,[]) + + +mkSafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] + -> UpdFrameOffset -> Bool + -> FCode CmmAGraph +mkSafeCall t fs as upd i = do + k <- newLabelC + let (_off, copyout) = copyInOflow NativeReturn (Young k) fs + -- see Note [safe foreign call convention] + return + ( mkStore (CmmStackSlot (Young k) (widthInBytes wordWidth)) + (CmmLit (CmmBlock k)) + <*> mkLast (CmmForeignCall { tgt=t, res=fs, args=as, succ=k + , updfr=upd, intrbl=i }) + <*> mkLabel k + <*> copyout + ) + -- ---------------------------------------------------------------------------- -- CgStmts @@ -640,4 +726,3 @@ cgStmtsToBlocks :: CmmAGraph -> FCode CmmGraph cgStmtsToBlocks stmts = do { us <- newUniqSupply ; return (initUs_ us (lgraphOfAGraph stmts)) } - diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index efa234b5a6..bd783a3b30 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -14,7 +14,9 @@ -- for details module StgCmmPrim ( - cgOpApp + cgOpApp, + cgPrimOp -- internal(ish), used by cgCase to get code for a + -- comparison without also turning it into a Bool. 
) where #include "HsVersions.h" @@ -67,14 +69,9 @@ cgOpApp :: StgOp -- The op -- Foreign calls cgOpApp (StgFCallOp fcall _) stg_args res_ty - = do { (res_regs, res_hints) <- newUnboxedTupleRegs res_ty - -- Choose result regs r1, r2 - -- Note [Foreign call results] - ; cgForeignCall res_regs res_hints fcall stg_args - -- r1, r2 = foo( x, y ) - ; emitReturn (map (CmmReg . CmmLocal) res_regs) } - -- return (r1, r2) - + = cgForeignCall fcall stg_args res_ty + -- Note [Foreign call results] + -- tagToEnum# is special: we need to pull the constructor -- out of the table, and perform an appropriate return. @@ -229,23 +226,23 @@ emitPrimOp [res] SparkOp [arg] [(tmp2,NoHint)] (CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit "newSpark")))) [(CmmReg (CmmGlobal BaseReg), AddrHint), ((CmmReg (CmmLocal tmp)), AddrHint)] - emit (mkAssign (CmmLocal res) (CmmReg (CmmLocal tmp))) + emitAssign (CmmLocal res) (CmmReg (CmmLocal tmp)) emitPrimOp [res] GetCCSOfOp [arg] - = emit (mkAssign (CmmLocal res) val) + = emitAssign (CmmLocal res) val where val | opt_SccProfilingOn = costCentreFrom (cmmUntag arg) | otherwise = CmmLit zeroCLit emitPrimOp [res] GetCurrentCCSOp [_dummy_arg] - = emit (mkAssign (CmmLocal res) curCCS) + = emitAssign (CmmLocal res) curCCS emitPrimOp [res] ReadMutVarOp [mutv] - = emit (mkAssign (CmmLocal res) (cmmLoadIndexW mutv fixedHdrSize gcWord)) + = emitAssign (CmmLocal res) (cmmLoadIndexW mutv fixedHdrSize gcWord) emitPrimOp [] WriteMutVarOp [mutv,var] = do - emit (mkStore (cmmOffsetW mutv fixedHdrSize) var) + emitStore (cmmOffsetW mutv fixedHdrSize) var emitCCall [{-no results-}] (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) @@ -269,32 +266,32 @@ emitPrimOp res@[] TouchOp args@[_arg] -- #define byteArrayContentszh(r,a) r = BYTE_ARR_CTS(a) emitPrimOp [res] ByteArrayContents_Char [arg] - = emit (mkAssign (CmmLocal res) (cmmOffsetB arg arrWordsHdrSize)) + = emitAssign (CmmLocal res) (cmmOffsetB arg arrWordsHdrSize) -- #define stableNameToIntzh(r,s) (r = 
((StgStableName *)s)->sn) emitPrimOp [res] StableNameToIntOp [arg] - = emit (mkAssign (CmmLocal res) (cmmLoadIndexW arg fixedHdrSize bWord)) + = emitAssign (CmmLocal res) (cmmLoadIndexW arg fixedHdrSize bWord) -- #define eqStableNamezh(r,sn1,sn2) \ -- (r = (((StgStableName *)sn1)->sn == ((StgStableName *)sn2)->sn)) emitPrimOp [res] EqStableNameOp [arg1,arg2] - = emit (mkAssign (CmmLocal res) (CmmMachOp mo_wordEq [ + = emitAssign (CmmLocal res) (CmmMachOp mo_wordEq [ cmmLoadIndexW arg1 fixedHdrSize bWord, cmmLoadIndexW arg2 fixedHdrSize bWord - ])) + ]) emitPrimOp [res] ReallyUnsafePtrEqualityOp [arg1,arg2] - = emit (mkAssign (CmmLocal res) (CmmMachOp mo_wordEq [arg1,arg2])) + = emitAssign (CmmLocal res) (CmmMachOp mo_wordEq [arg1,arg2]) -- #define addrToHValuezh(r,a) r=(P_)a emitPrimOp [res] AddrToAnyOp [arg] - = emit (mkAssign (CmmLocal res) arg) + = emitAssign (CmmLocal res) arg -- #define dataToTagzh(r,a) r=(GET_TAG(((StgClosure *)a)->header.info)) -- Note: argument may be tagged! emitPrimOp [res] DataToTagOp [arg] - = emit (mkAssign (CmmLocal res) (getConstrTag (cmmUntag arg))) + = emitAssign (CmmLocal res) (getConstrTag (cmmUntag arg)) {- Freezing arrays-of-ptrs requires changing an info table, for the benefit of the generational collector. 
It needs to scavenge mutable @@ -317,7 +314,7 @@ emitPrimOp [res] UnsafeFreezeArrayArrayOp [arg] -- #define unsafeFreezzeByteArrayzh(r,a) r=(a) emitPrimOp [res] UnsafeFreezeByteArrayOp [arg] - = emit (mkAssign (CmmLocal res) arg) + = emitAssign (CmmLocal res) arg -- Copying pointer arrays @@ -497,11 +494,11 @@ emitPrimOp [res] PopCntOp [w] = emitPopCntCall res w wordWidth -- The rest just translate straightforwardly emitPrimOp [res] op [arg] | nopOp op - = emit (mkAssign (CmmLocal res) arg) + = emitAssign (CmmLocal res) arg | Just (mop,rep) <- narrowOp op - = emit (mkAssign (CmmLocal res) $ - CmmMachOp (mop rep wordWidth) [CmmMachOp (mop wordWidth rep) [arg]]) + = emitAssign (CmmLocal res) $ + CmmMachOp (mop rep wordWidth) [CmmMachOp (mop wordWidth rep) [arg]] emitPrimOp r@[res] op args | Just prim <- callishOp op @@ -746,15 +743,15 @@ loadArrPtrsSize addr = CmmLoad (cmmOffsetB addr off) bWord mkBasicIndexedRead :: ByteOff -> Maybe MachOp -> CmmType -> LocalReg -> CmmExpr -> CmmExpr -> FCode () mkBasicIndexedRead off Nothing read_rep res base idx - = emit (mkAssign (CmmLocal res) (cmmLoadIndexOffExpr off read_rep base idx)) + = emitAssign (CmmLocal res) (cmmLoadIndexOffExpr off read_rep base idx) mkBasicIndexedRead off (Just cast) read_rep res base idx - = emit (mkAssign (CmmLocal res) (CmmMachOp cast [ - cmmLoadIndexOffExpr off read_rep base idx])) + = emitAssign (CmmLocal res) (CmmMachOp cast [ + cmmLoadIndexOffExpr off read_rep base idx]) mkBasicIndexedWrite :: ByteOff -> Maybe MachOp -> CmmExpr -> CmmExpr -> CmmExpr -> FCode () mkBasicIndexedWrite off Nothing base idx val - = emit (mkStore (cmmIndexOffExpr off (typeWidth (cmmExprType val)) base idx) val) + = emitStore (cmmIndexOffExpr off (typeWidth (cmmExprType val)) base idx) val mkBasicIndexedWrite off (Just cast) base idx val = mkBasicIndexedWrite off Nothing base idx (CmmMachOp cast [val]) @@ -805,7 +802,7 @@ doCopyMutableByteArrayOp = emitCopyByteArray copy getCode $ emitMemmoveCall dst_p src_p bytes 
(CmmLit (mkIntCLit 1)), getCode $ emitMemcpyCall dst_p src_p bytes (CmmLit (mkIntCLit 1)) ] - emit $ mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall + emit =<< mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall emitCopyByteArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()) @@ -875,7 +872,7 @@ doCopyMutableArrayOp = emitCopyArray copy getCode $ emitMemmoveCall dst_p src_p bytes (CmmLit (mkIntCLit wORD_SIZE)), getCode $ emitMemcpyCall dst_p src_p bytes (CmmLit (mkIntCLit wORD_SIZE)) ] - emit $ mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall + emit =<< mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()) diff --git a/compiler/codeGen/StgCmmProf.hs b/compiler/codeGen/StgCmmProf.hs index 6a53317385..9ff4d0be07 100644 --- a/compiler/codeGen/StgCmmProf.hs +++ b/compiler/codeGen/StgCmmProf.hs @@ -103,7 +103,7 @@ initUpdFrameProf :: CmmExpr -> FCode () -- Initialise the profiling field of an update frame initUpdFrameProf frame_amode = ifProfiling $ -- frame->header.prof.ccs = CCCS - emit (mkStore (cmmOffsetB frame_amode oFFSET_StgHeader_ccs) curCCS) + emitStore (cmmOffsetB frame_amode oFFSET_StgHeader_ccs) curCCS -- frame->header.prof.hp.rs = NULL (or frame-header.prof.hp.ldvw = 0) -- is unnecessary because it is not used anyhow. 
@@ -143,7 +143,7 @@ saveCurrentCostCentre = return Nothing | otherwise = do { local_cc <- newTemp ccType - ; emit (mkAssign (CmmLocal local_cc) curCCS) + ; emitAssign (CmmLocal local_cc) curCCS ; return (Just local_cc) } restoreCurrentCostCentre :: Maybe LocalReg -> FCode () @@ -338,9 +338,9 @@ ldvEnter cl_ptr -- if (era > 0) { -- LDVW((c)) = (LDVW((c)) & LDV_CREATE_MASK) | -- era | LDV_STATE_USE } - emit (mkCmmIfThenElse (CmmMachOp mo_wordUGt [loadEra, CmmLit zeroCLit]) + emit =<< mkCmmIfThenElse (CmmMachOp mo_wordUGt [loadEra, CmmLit zeroCLit]) (mkStore ldv_wd new_ldv_wd) - mkNop) + mkNop where -- don't forget to substract node's tag ldv_wd = ldvWord cl_ptr diff --git a/compiler/codeGen/StgCmmTicky.hs b/compiler/codeGen/StgCmmTicky.hs index d0432315ab..698bf32709 100644 --- a/compiler/codeGen/StgCmmTicky.hs +++ b/compiler/codeGen/StgCmmTicky.hs @@ -180,7 +180,7 @@ registerTickyCtr :: CLabel -> FCode () -- ticky_entry_ctrs = & (f_ct); /* mark it as "registered" */ -- f_ct.registeredp = 1 } registerTickyCtr ctr_lbl - = emit (mkCmmIfThen test (catAGraphs register_stmts)) + = emit =<< mkCmmIfThen test (catAGraphs register_stmts) where -- krc: code generator doesn't handle Not, so we test for Eq 0 instead test = CmmMachOp (MO_Eq wordWidth) @@ -352,7 +352,7 @@ bumpHistogram _lbl _n bumpHistogramE :: LitString -> CmmExpr -> FCode () bumpHistogramE lbl n = do t <- newTemp cLong - emit (mkAssign (CmmLocal t) n) + emitAssign (CmmLocal t) n emit (mkCmmIfThen (CmmMachOp (MO_U_Le cLongWidth) [CmmReg (CmmLocal t), eight]) (mkAssign (CmmLocal t) eight)) emit (addToMem cLong diff --git a/compiler/codeGen/StgCmmUtils.hs b/compiler/codeGen/StgCmmUtils.hs index bb4a653c05..273e59b0b5 100644 --- a/compiler/codeGen/StgCmmUtils.hs +++ b/compiler/codeGen/StgCmmUtils.hs @@ -18,12 +18,11 @@ module StgCmmUtils ( emitDataLits, mkDataLits, emitRODataLits, mkRODataLits, emitRtsCall, emitRtsCallWithVols, emitRtsCallWithResult, emitRtsCallGen, - assignTemp, newTemp, withTemp, + assignTemp, 
newTemp, newUnboxedTupleRegs, - mkMultiAssign, mkCmmSwitch, mkCmmLitSwitch, - emitSwitch, + emitMultiAssign, emitCmmLitSwitch, emitSwitch, tagToClosure, mkTaggedObjectLoad, @@ -72,6 +71,7 @@ import Module import Literal import Digraph import ListSetOps +import VarSet import Util import Unique import DynFlags @@ -204,14 +204,14 @@ emitRtsCallGen emitRtsCallGen res pkg fun args _vols safe = do { updfr_off <- getUpdFrameOff ; emit caller_save - ; emit $ call updfr_off + ; call updfr_off ; emit caller_load } where call updfr_off = if safe then - mkCmmCall fun_expr res' args' updfr_off + emit =<< mkCmmCall fun_expr res' args' updfr_off else - mkUnsafeCall (ForeignTarget fun_expr + emit $ mkUnsafeCall (ForeignTarget fun_expr (ForeignConvention CCallConv arg_hints res_hints)) res' args' (args', arg_hints) = unzip args (res', res_hints) = unzip res @@ -441,7 +441,7 @@ assignTemp :: CmmExpr -> FCode LocalReg assignTemp (CmmReg (CmmLocal reg)) = return reg assignTemp e = do { uniq <- newUnique ; let reg = LocalReg uniq (cmmExprType e) - ; emit (mkAssign (CmmLocal reg) e) + ; emitAssign (CmmLocal reg) e ; return reg } newTemp :: CmmType -> FCode LocalReg @@ -471,10 +471,10 @@ newUnboxedTupleRegs res_ty ------------------------------------------------------------------------- --- mkMultiAssign +-- emitMultiAssign ------------------------------------------------------------------------- -mkMultiAssign :: [LocalReg] -> [CmmExpr] -> CmmAGraph +emitMultiAssign :: [LocalReg] -> [CmmExpr] -> FCode () -- Emit code to perform the assignments in the -- input simultaneously, using temporary variables when necessary. @@ -489,14 +489,13 @@ type Stmt = (LocalReg, CmmExpr) -- r := e -- s1 assigns to something s2 uses -- that is, if s1 should *follow* s2 in the final order -mkMultiAssign [] [] = mkNop -mkMultiAssign [reg] [rhs] = mkAssign (CmmLocal reg) rhs -mkMultiAssign regs rhss = ASSERT( equalLength regs rhss ) - unscramble ([1..] 
`zip` (regs `zip` rhss)) +emitMultiAssign [] [] = return () +emitMultiAssign [reg] [rhs] = emitAssign (CmmLocal reg) rhs +emitMultiAssign regs rhss = ASSERT( equalLength regs rhss ) + unscramble ([1..] `zip` (regs `zip` rhss)) -unscramble :: [Vrtx] -> CmmAGraph -unscramble vertices - = catAGraphs (map do_component components) +unscramble :: [Vrtx] -> FCode () +unscramble vertices = mapM_ do_component components where edges :: [ (Vrtx, Key, [Key]) ] edges = [ (vertex, key1, edges_from stmt1) @@ -511,19 +510,19 @@ unscramble vertices -- do_components deal with one strongly-connected component -- Not cyclic, or singleton? Just do it - do_component :: SCC Vrtx -> CmmAGraph - do_component (AcyclicSCC (_,stmt)) = mk_graph stmt + do_component :: SCC Vrtx -> FCode () + do_component (AcyclicSCC (_,stmt)) = mk_graph stmt do_component (CyclicSCC []) = panic "do_component" do_component (CyclicSCC [(_,stmt)]) = mk_graph stmt -- Cyclic? Then go via temporaries. Pick one to -- break the loop and try again with the rest. 
- do_component (CyclicSCC ((_,first_stmt) : rest)) - = withUnique $ \u -> + do_component (CyclicSCC ((_,first_stmt) : rest)) = do + u <- newUnique let (to_tmp, from_tmp) = split u first_stmt - in mk_graph to_tmp - <*> unscramble rest - <*> mk_graph from_tmp + mk_graph to_tmp + unscramble rest + mk_graph from_tmp split :: Unique -> Stmt -> (Stmt, Stmt) split uniq (reg, rhs) @@ -532,8 +531,8 @@ unscramble vertices rep = cmmExprType rhs tmp = LocalReg uniq rep - mk_graph :: Stmt -> CmmAGraph - mk_graph (reg, rhs) = mkAssign (CmmLocal reg) rhs + mk_graph :: Stmt -> FCode () + mk_graph (reg, rhs) = emitAssign (CmmLocal reg) rhs mustFollow :: Stmt -> Stmt -> Bool (reg, _) `mustFollow` (_, rhs) = CmmLocal reg `regUsedIn` rhs @@ -551,7 +550,7 @@ emitSwitch :: CmmExpr -- Tag to switch on -> FCode () emitSwitch tag_expr branches mb_deflt lo_tag hi_tag = do { dflags <- getDynFlags - ; emit (mkCmmSwitch (via_C dflags) tag_expr branches mb_deflt lo_tag hi_tag) } + ; mkCmmSwitch (via_C dflags) tag_expr branches mb_deflt lo_tag hi_tag } where via_C dflags | HscC <- hscTarget dflags = True | otherwise = False @@ -563,38 +562,40 @@ mkCmmSwitch :: Bool -- True <=> never generate a conditional tree -> Maybe CmmAGraph -- Default branch (if any) -> ConTagZ -> ConTagZ -- Min and Max possible values; behaviour -- outside this range is undefined - -> CmmAGraph + -> FCode () -- First, two rather common cases in which there is no work to do -mkCmmSwitch _ _ [] (Just code) _ _ = code -mkCmmSwitch _ _ [(_,code)] Nothing _ _ = code +mkCmmSwitch _ _ [] (Just code) _ _ = emit code +mkCmmSwitch _ _ [(_,code)] Nothing _ _ = emit code -- Right, off we go -mkCmmSwitch via_C tag_expr branches mb_deflt lo_tag hi_tag - = withFreshLabel "switch join" $ \ join_lbl -> - label_default join_lbl mb_deflt $ \ mb_deflt -> - label_branches join_lbl branches $ \ branches -> - assignTemp' tag_expr $ \tag_expr' -> +mkCmmSwitch via_C tag_expr branches mb_deflt lo_tag hi_tag = do + join_lbl <- newLabelC + 
mb_deflt_lbl <- label_default join_lbl mb_deflt + branches_lbls <- label_branches join_lbl branches + tag_expr' <- assignTemp' tag_expr - mk_switch tag_expr' (sortBy (comparing fst) branches) mb_deflt - lo_tag hi_tag via_C - -- Sort the branches before calling mk_switch - <*> mkLabel join_lbl + emit =<< mk_switch tag_expr' (sortBy (comparing fst) branches_lbls) + mb_deflt_lbl lo_tag hi_tag via_C + + -- Sort the branches before calling mk_switch + + emitLabel join_lbl mk_switch :: CmmExpr -> [(ConTagZ, BlockId)] -> Maybe BlockId -> ConTagZ -> ConTagZ -> Bool - -> CmmAGraph + -> FCode CmmAGraph -- SINGLETON TAG RANGE: no case analysis to do mk_switch _tag_expr [(tag, lbl)] _ lo_tag hi_tag _via_C | lo_tag == hi_tag = ASSERT( tag == lo_tag ) - mkBranch lbl + return (mkBranch lbl) -- SINGLETON BRANCH, NO DEFAULT: no case analysis to do mk_switch _tag_expr [(_tag,lbl)] Nothing _ _ _ - = mkBranch lbl + = return (mkBranch lbl) -- The simplifier might have eliminated a case -- so we may have e.g. 
case xs of -- [] -> e @@ -603,7 +604,7 @@ mk_switch _tag_expr [(_tag,lbl)] Nothing _ _ _ -- SINGLETON BRANCH: one equality check to do mk_switch tag_expr [(tag,lbl)] (Just deflt) _ _ _ - = mkCbranch cond deflt lbl + = return (mkCbranch cond deflt lbl) where cond = cmmNeWord tag_expr (CmmLit (mkIntCLit tag)) -- We have lo_tag < hi_tag, but there's only one branch, @@ -636,30 +637,34 @@ mk_switch tag_expr branches mb_deflt lo_tag hi_tag via_C arms :: [Maybe BlockId] arms = [ find_branch i | i <- [real_lo_tag..real_hi_tag]] in - mkSwitch (cmmOffset tag_expr (- real_lo_tag)) arms + return (mkSwitch (cmmOffset tag_expr (- real_lo_tag)) arms) -- if we can knock off a bunch of default cases with one if, then do so | Just deflt <- mb_deflt, (lowest_branch - lo_tag) >= n_branches - = mkCmmIfThenElse + = do stmts <- mk_switch tag_expr branches mb_deflt + lowest_branch hi_tag via_C + mkCmmIfThenElse (cmmULtWord tag_expr (CmmLit (mkIntCLit lowest_branch))) (mkBranch deflt) - (mk_switch tag_expr branches mb_deflt - lowest_branch hi_tag via_C) + stmts | Just deflt <- mb_deflt, (hi_tag - highest_branch) >= n_branches - = mkCmmIfThenElse + = do stmts <- mk_switch tag_expr branches mb_deflt + lo_tag highest_branch via_C + mkCmmIfThenElse (cmmUGtWord tag_expr (CmmLit (mkIntCLit highest_branch))) (mkBranch deflt) - (mk_switch tag_expr branches mb_deflt - lo_tag highest_branch via_C) + stmts | otherwise -- Use an if-tree - = mkCmmIfThenElse + = do lo_stmts <- mk_switch tag_expr lo_branches mb_deflt + lo_tag (mid_tag-1) via_C + hi_stmts <- mk_switch tag_expr hi_branches mb_deflt + mid_tag hi_tag via_C + mkCmmIfThenElse (cmmUGeWord tag_expr (CmmLit (mkIntCLit mid_tag))) - (mk_switch tag_expr hi_branches mb_deflt - mid_tag hi_tag via_C) - (mk_switch tag_expr lo_branches mb_deflt - lo_tag (mid_tag-1) via_C) + hi_stmts + lo_stmts -- we test (e >= mid_tag) rather than (e < mid_tag), because -- the former works better when e is a comparison, and there -- are two tags 0 & 1 (mid_tag == 1). 
In this case, the code @@ -714,30 +719,31 @@ mk_switch tag_expr branches mb_deflt lo_tag hi_tag via_C is_lo (t,_) = t < mid_tag -------------- -mkCmmLitSwitch :: CmmExpr -- Tag to switch on +emitCmmLitSwitch :: CmmExpr -- Tag to switch on -> [(Literal, CmmAGraph)] -- Tagged branches -> CmmAGraph -- Default branch (always) - -> CmmAGraph -- Emit the code + -> FCode () -- Emit the code -- Used for general literals, whose size might not be a word, -- where there is always a default case, and where we don't know -- the range of values for certain. For simplicity we always generate a tree. -- -- ToDo: for integers we could do better here, perhaps by generalising -- mk_switch and using that. --SDM 15/09/2004 -mkCmmLitSwitch _scrut [] deflt = deflt -mkCmmLitSwitch scrut branches deflt - = assignTemp' scrut $ \ scrut' -> - withFreshLabel "switch join" $ \ join_lbl -> - label_code join_lbl deflt $ \ deflt -> - label_branches join_lbl branches $ \ branches -> - mk_lit_switch scrut' deflt (sortBy (comparing fst) branches) - <*> mkLabel join_lbl +emitCmmLitSwitch _scrut [] deflt = emit deflt +emitCmmLitSwitch scrut branches deflt = do + scrut' <- assignTemp' scrut + join_lbl <- newLabelC + deflt_lbl <- label_code join_lbl deflt + branches_lbls <- label_branches join_lbl branches + emit =<< mk_lit_switch scrut' deflt_lbl + (sortBy (comparing fst) branches_lbls) + emitLabel join_lbl mk_lit_switch :: CmmExpr -> BlockId -> [(Literal,BlockId)] - -> CmmAGraph + -> FCode CmmAGraph mk_lit_switch scrut deflt [(lit,blk)] - = mkCbranch (CmmMachOp ne [scrut, CmmLit cmm_lit]) deflt blk + = return (mkCbranch (CmmMachOp ne [scrut, CmmLit cmm_lit]) deflt blk) where cmm_lit = mkSimpleLit lit cmm_ty = cmmLitType cmm_lit @@ -745,9 +751,9 @@ mk_lit_switch scrut deflt [(lit,blk)] ne = if isFloatType cmm_ty then MO_F_Ne rep else MO_Ne rep mk_lit_switch scrut deflt_blk_id branches - = mkCmmIfThenElse cond - (mk_lit_switch scrut deflt_blk_id lo_branches) - (mk_lit_switch scrut deflt_blk_id 
hi_branches) + = do lo_blk <- mk_lit_switch scrut deflt_blk_id lo_branches + hi_blk <- mk_lit_switch scrut deflt_blk_id hi_branches + mkCmmIfThenElse cond lo_blk hi_blk where n_branches = length branches (mid_lit,_) = branches !! (n_branches `div` 2) @@ -761,49 +767,42 @@ mk_lit_switch scrut deflt_blk_id branches -------------- -label_default :: BlockId -> Maybe CmmAGraph - -> (Maybe BlockId -> CmmAGraph) - -> CmmAGraph -label_default _ Nothing thing_inside - = thing_inside Nothing -label_default join_lbl (Just code) thing_inside - = label_code join_lbl code $ \ lbl -> - thing_inside (Just lbl) +label_default :: BlockId -> Maybe CmmAGraph -> FCode (Maybe BlockId) +label_default _ Nothing + = return Nothing +label_default join_lbl (Just code) + = do lbl <- label_code join_lbl code + return (Just lbl) -------------- -label_branches :: BlockId -> [(a,CmmAGraph)] - -> ([(a,BlockId)] -> CmmAGraph) - -> CmmAGraph -label_branches _join_lbl [] thing_inside - = thing_inside [] -label_branches join_lbl ((tag,code):branches) thing_inside - = label_code join_lbl code $ \ lbl -> - label_branches join_lbl branches $ \ branches' -> - thing_inside ((tag,lbl):branches') +label_branches :: BlockId -> [(a,CmmAGraph)] -> FCode [(a,BlockId)] +label_branches _join_lbl [] + = return [] +label_branches join_lbl ((tag,code):branches) + = do lbl <- label_code join_lbl code + branches' <- label_branches join_lbl branches + return ((tag,lbl):branches') -------------- -label_code :: BlockId -> CmmAGraph -> (BlockId -> CmmAGraph) -> CmmAGraph --- (label_code J code fun) +label_code :: BlockId -> CmmAGraph -> FCode BlockId +-- label_code J code -- generates --- [L: code; goto J] fun L -label_code join_lbl code thing_inside - = withFreshLabel "switch" $ \lbl -> - outOfLine (mkLabel lbl <*> code <*> mkBranch join_lbl) - <*> thing_inside lbl - +-- [L: code; goto J] +-- and returns L +label_code join_lbl code = do + lbl <- newLabelC + emitOutOfLine lbl (code <*> mkBranch join_lbl) + return lbl 
-------------- -assignTemp' :: CmmExpr -> (CmmExpr -> CmmAGraph) -> CmmAGraph -assignTemp' e thing_inside - | isTrivialCmmExpr e = thing_inside e - | otherwise = withTemp (cmmExprType e) $ \ lreg -> - let reg = CmmLocal lreg in - mkAssign reg e <*> thing_inside (CmmReg reg) - -withTemp :: CmmType -> (LocalReg -> CmmAGraph) -> CmmAGraph -withTemp rep thing_inside - = withUnique $ \uniq -> thing_inside (LocalReg uniq rep) - +assignTemp' :: CmmExpr -> FCode CmmExpr +assignTemp' e + | isTrivialCmmExpr e = return e + | otherwise = do + lreg <- newTemp (cmmExprType e) + let reg = CmmLocal lreg + emitAssign reg e + return (CmmReg reg) ------------------------------------------------------------------------- -- @@ -811,36 +810,13 @@ withTemp rep thing_inside -- ------------------------------------------------------------------------- --- There is just one SRT for each top level binding; all the nested --- bindings use sub-sections of this SRT. The label is passed down to --- the nested bindings via the monad. - -getSRTInfo :: SRT -> FCode C_SRT -getSRTInfo (SRTEntries {}) = panic "getSRTInfo" - -getSRTInfo (SRT off len bmp) - | len > hALF_WORD_SIZE_IN_BITS || bmp == [fromIntegral srt_escape] - = do { id <- newUnique - -- ; top_srt <- getSRTLabel - ; let srt_desc_lbl = mkLargeSRTLabel id - -- JD: We're not constructing and emitting SRTs in the back end, - -- which renders this code wrong (it now names a now-non-existent label). - -- ; emitRODataLits srt_desc_lbl - -- ( cmmLabelOffW top_srt off - -- : mkWordCLit (fromIntegral len) - -- : map mkWordCLit bmp) - ; return (C_SRT srt_desc_lbl 0 srt_escape) } - - | otherwise - = do { top_srt <- getSRTLabel - ; return (C_SRT top_srt off (fromIntegral (head bmp))) } - -- The fromIntegral converts to StgHalfWord - -getSRTInfo NoSRT - = -- TODO: Should we panic in this case? - -- Someone obviously thinks there should be an SRT - return NoC_SRT - +-- | Returns 'True' if there is a non-empty SRT, or 'False' otherwise +-- NB. 
the SRT attached to an StgBind is still used in the new codegen +-- to decide whether we need a static link field on a static closure +-- or not. +getSRTInfo :: SRT -> FCode Bool +getSRTInfo (SRTEntries vs) = return (not (isEmptyVarSet vs)) +getSRTInfo _ = return False srt_escape :: StgHalfWord srt_escape = -1 diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in index 4371bca95e..3c13bb4704 100644 --- a/compiler/ghc.cabal.in +++ b/compiler/ghc.cabal.in @@ -185,16 +185,15 @@ Library CmmOpt CmmParse CmmProcPoint - CmmSpillReload CmmRewriteAssignments - CmmStackLayout CmmType CmmUtils + CmmLayoutStack MkGraph OldCmm + OldCmmLint OldCmmUtils OldPprCmm - OptimizationFuel PprBase PprC PprCmm @@ -440,6 +439,7 @@ Library Pretty Serialized State + Stream StringBuffer UniqFM UniqSet @@ -473,6 +473,8 @@ Library Vectorise.Env Vectorise.Exp Vectorise + Hoopl.Dataflow + Hoopl Exposed-Modules: AsmCodeGen diff --git a/compiler/ghc.mk b/compiler/ghc.mk index f541841046..1ea6159812 100644 --- a/compiler/ghc.mk +++ b/compiler/ghc.mk @@ -355,7 +355,7 @@ ifeq "$(GhcProfiled)" "YES" # parts of the compiler of interest, and then add further cost centres # as necessary. 
Turn on -auto-all for individual modules like this: -compiler/main/DriverPipeline_HC_OPTS += -auto-all +# compiler/main/DriverPipeline_HC_OPTS += -auto-all compiler/main/GhcMake_HC_OPTS += -auto-all compiler/main/GHC_HC_OPTS += -auto-all diff --git a/compiler/main/CodeOutput.lhs b/compiler/main/CodeOutput.lhs index 24906671cd..e92eb4f34c 100644 --- a/compiler/main/CodeOutput.lhs +++ b/compiler/main/CodeOutput.lhs @@ -15,22 +15,22 @@ import UniqSupply ( mkSplitUniqSupply ) import Finder ( mkStubPaths ) import PprC ( writeCs ) -import CmmLint ( cmmLint ) +import OldCmmLint ( cmmLint ) import Packages import OldCmm ( RawCmmGroup ) import HscTypes import DynFlags import Config import SysTools +import Stream (Stream) +import qualified Stream import ErrUtils import Outputable import Module -import Maybes ( firstJusts ) import SrcLoc import Control.Exception -import Control.Monad import System.Directory import System.FilePath import System.IO @@ -48,19 +48,26 @@ codeOutput :: DynFlags -> ModLocation -> ForeignStubs -> [PackageId] - -> [RawCmmGroup] -- Compiled C-- + -> Stream IO RawCmmGroup () -- Compiled C-- -> IO (Bool{-stub_h_exists-}, Maybe FilePath{-stub_c_exists-}) -codeOutput dflags this_mod location foreign_stubs pkg_deps flat_abstractC +codeOutput dflags this_mod location foreign_stubs pkg_deps cmm_stream = - do { when (dopt Opt_DoCmmLinting dflags) $ do + do { + -- Lint each CmmGroup as it goes past + ; let linted_cmm_stream = + if dopt Opt_DoCmmLinting dflags + then Stream.mapM do_lint cmm_stream + else cmm_stream + + do_lint cmm = do { showPass dflags "CmmLint" - ; let lints = map (cmmLint (targetPlatform dflags)) flat_abstractC - ; case firstJusts lints of + ; case cmmLint (targetPlatform dflags) cmm of Just err -> do { log_action dflags dflags SevDump noSrcSpan defaultDumpStyle err ; ghcExit dflags 1 } Nothing -> return () + ; return cmm } ; showPass dflags "CodeOutput" @@ -68,9 +75,9 @@ codeOutput dflags this_mod location foreign_stubs pkg_deps 
flat_abstractC ; stubs_exist <- outputForeignStubs dflags this_mod location foreign_stubs ; case hscTarget dflags of { HscInterpreted -> return (); - HscAsm -> outputAsm dflags filenm flat_abstractC; - HscC -> outputC dflags filenm flat_abstractC pkg_deps; - HscLlvm -> outputLlvm dflags filenm flat_abstractC; + HscAsm -> outputAsm dflags filenm linted_cmm_stream; + HscC -> outputC dflags filenm linted_cmm_stream pkg_deps; + HscLlvm -> outputLlvm dflags filenm linted_cmm_stream; HscNothing -> panic "codeOutput: HscNothing" } ; return stubs_exist @@ -90,12 +97,16 @@ doOutput filenm io_action = bracket (openFile filenm WriteMode) hClose io_action \begin{code} outputC :: DynFlags -> FilePath - -> [RawCmmGroup] + -> Stream IO RawCmmGroup () -> [PackageId] -> IO () -outputC dflags filenm flat_absC packages +outputC dflags filenm cmm_stream packages = do + -- ToDo: make the C backend consume the C-- incrementally, by + -- pushing the cmm_stream inside (c.f. nativeCodeGen) + rawcmms <- Stream.collect cmm_stream + -- figure out which header files to #include in the generated .hc file: -- -- * extra_includes from packages @@ -117,7 +128,7 @@ outputC dflags filenm flat_absC packages doOutput filenm $ \ h -> do hPutStr h ("/* GHC_PACKAGES " ++ unwords pkg_names ++ "\n*/\n") hPutStr h cc_injects - writeCs dflags h flat_absC + writeCs dflags h rawcmms \end{code} @@ -128,14 +139,14 @@ outputC dflags filenm flat_absC packages %************************************************************************ \begin{code} -outputAsm :: DynFlags -> FilePath -> [RawCmmGroup] -> IO () -outputAsm dflags filenm flat_absC +outputAsm :: DynFlags -> FilePath -> Stream IO RawCmmGroup () -> IO () +outputAsm dflags filenm cmm_stream | cGhcWithNativeCodeGen == "YES" = do ncg_uniqs <- mkSplitUniqSupply 'n' {-# SCC "OutputAsm" #-} doOutput filenm $ \f -> {-# SCC "NativeCodeGen" #-} - nativeCodeGen dflags f ncg_uniqs flat_absC + nativeCodeGen dflags f ncg_uniqs cmm_stream | otherwise = panic "This compiler 
was built without a native code generator" @@ -149,12 +160,17 @@ outputAsm dflags filenm flat_absC %************************************************************************ \begin{code} -outputLlvm :: DynFlags -> FilePath -> [RawCmmGroup] -> IO () -outputLlvm dflags filenm flat_absC +outputLlvm :: DynFlags -> FilePath -> Stream IO RawCmmGroup () -> IO () +outputLlvm dflags filenm cmm_stream = do ncg_uniqs <- mkSplitUniqSupply 'n' + + -- ToDo: make the LLVM backend consume the C-- incrementally, + -- by pushing the cmm_stream inside (c.f. nativeCodeGen) + rawcmms <- Stream.collect cmm_stream + {-# SCC "llvm_output" #-} doOutput filenm $ \f -> {-# SCC "llvm_CodeGen" #-} - llvmCodeGen dflags f ncg_uniqs flat_absC + llvmCodeGen dflags f ncg_uniqs rawcmms \end{code} @@ -240,4 +256,3 @@ outputForeignStubs_help fname doc_str header footer = do writeFile fname (header ++ doc_str ++ '\n':footer ++ "\n") return True \end{code} - diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs index 53aa39f04e..60b6e82bb7 100644 --- a/compiler/main/DynFlags.hs +++ b/compiler/main/DynFlags.hs @@ -164,9 +164,9 @@ data DynFlag = Opt_D_dump_cmm | Opt_D_dump_raw_cmm | Opt_D_dump_cmmz - | Opt_D_dump_cmmz_pretty -- All of the cmmz subflags (there are a lot!) 
Automatically -- enabled if you run -ddump-cmmz + | Opt_D_dump_cmmz_cfg | Opt_D_dump_cmmz_cbe | Opt_D_dump_cmmz_proc | Opt_D_dump_cmmz_spills @@ -1675,7 +1675,7 @@ dynamic_flags = [ , Flag "ddump-cmm" (setDumpFlag Opt_D_dump_cmm) , Flag "ddump-raw-cmm" (setDumpFlag Opt_D_dump_raw_cmm) , Flag "ddump-cmmz" (setDumpFlag Opt_D_dump_cmmz) - , Flag "ddump-cmmz-pretty" (setDumpFlag Opt_D_dump_cmmz_pretty) + , Flag "ddump-cmmz-cfg" (setDumpFlag Opt_D_dump_cmmz_cbe) , Flag "ddump-cmmz-cbe" (setDumpFlag Opt_D_dump_cmmz_cbe) , Flag "ddump-cmmz-spills" (setDumpFlag Opt_D_dump_cmmz_spills) , Flag "ddump-cmmz-proc" (setDumpFlag Opt_D_dump_cmmz_proc) diff --git a/compiler/main/HscMain.hs b/compiler/main/HscMain.hs index 562332d52a..0b03e83029 100644 --- a/compiler/main/HscMain.hs +++ b/compiler/main/HscMain.hs @@ -119,13 +119,12 @@ import TyCon import Name import SimplStg ( stg2stg ) import CodeGen ( codeGen ) -import OldCmm as Old ( CmmGroup ) -import PprCmm ( pprCmms ) +import qualified OldCmm as Old +import qualified Cmm as New import CmmParse ( parseCmmFile ) import CmmBuildInfoTables import CmmPipeline import CmmInfo -import OptimizationFuel ( initOptFuelState ) import CmmCvt import CodeOutput import NameEnv ( emptyNameEnv ) @@ -147,6 +146,9 @@ import UniqFM ( emptyUFM ) import UniqSupply ( initUs_ ) import Bag import Exception +import qualified Stream +import Stream (Stream) + import Util import Data.List @@ -172,7 +174,6 @@ newHscEnv dflags = do nc_var <- newIORef (initNameCache us knownKeyNames) fc_var <- newIORef emptyUFM mlc_var <- newIORef emptyModuleEnv - optFuel <- initOptFuelState return HscEnv { hsc_dflags = dflags, hsc_targets = [], hsc_mod_graph = [], @@ -182,7 +183,6 @@ newHscEnv dflags = do hsc_NC = nc_var, hsc_FC = fc_var, hsc_MLC = mlc_var, - hsc_OptFuel = optFuel, hsc_type_env_var = Nothing } @@ -1276,20 +1276,27 @@ hscGenHardCode cgguts mod_summary = do cost_centre_info stg_binds hpc_info else {-# SCC "CodeGen" #-} - codeGen dflags this_mod data_tycons - 
cost_centre_info - stg_binds hpc_info + return (codeGen dflags this_mod data_tycons + cost_centre_info + stg_binds hpc_info) + ------------------ Code output ----------------------- - rawcmms <- {-# SCC "cmmToRawCmm" #-} + rawcmms0 <- {-# SCC "cmmToRawCmm" #-} cmmToRawCmm platform cmms - dumpIfSet_dyn dflags Opt_D_dump_raw_cmm "Raw Cmm" (ppr rawcmms) + + let dump a = do dumpIfSet_dyn dflags Opt_D_dump_raw_cmm "Raw Cmm" + (ppr a) + return a + rawcmms1 = Stream.mapM dump rawcmms0 + (_stub_h_exists, stub_c_exists) <- {-# SCC "codeOutput" #-} codeOutput dflags this_mod location foreign_stubs - dependencies rawcmms + dependencies rawcmms1 return stub_c_exists + hscInteractive :: (ModIface, ModDetails, CgGuts) -> ModSummary -> Hsc (InteractiveStatus, ModIface, ModDetails) @@ -1335,7 +1342,7 @@ hscCompileCmmFile hsc_env filename = runHsc hsc_env $ do let dflags = hsc_dflags hsc_env cmm <- ioMsgMaybe $ parseCmmFile dflags filename liftIO $ do - rawCmms <- cmmToRawCmm (targetPlatform dflags) [cmm] + rawCmms <- cmmToRawCmm (targetPlatform dflags) (Stream.yield cmm) _ <- codeOutput dflags no_mod no_loc NoStubs [] rawCmms return () where @@ -1350,24 +1357,52 @@ tryNewCodeGen :: HscEnv -> Module -> [TyCon] -> CollectedCCs -> [(StgBinding,[(Id,[Id])])] -> HpcInfo - -> IO [Old.CmmGroup] + -> IO (Stream IO Old.CmmGroup ()) + -- Note we produce a 'Stream' of CmmGroups, so that the + -- backend can be run incrementally. Otherwise it generates all + -- the C-- up front, which has a significant space cost. 
tryNewCodeGen hsc_env this_mod data_tycons cost_centre_info stg_binds hpc_info = do let dflags = hsc_dflags hsc_env - prog <- StgCmm.codeGen dflags this_mod data_tycons + + let cmm_stream :: Stream IO New.CmmGroup () + cmm_stream = {-# SCC "StgCmm" #-} + StgCmm.codeGen dflags this_mod data_tycons cost_centre_info stg_binds hpc_info - dumpIfSet_dyn dflags Opt_D_dump_cmmz "Cmm produced by new codegen" - (pprCmms prog) + + -- codegen consumes a stream of CmmGroup, and produces a new + -- stream of CmmGroup (not necessarily synchronised: one + -- CmmGroup on input may produce many CmmGroups on output due + -- to proc-point splitting). + + let dump1 a = do dumpIfSet_dyn dflags Opt_D_dump_cmmz + "Cmm produced by new codegen" (ppr a) + return a + + ppr_stream1 = Stream.mapM dump1 cmm_stream -- We are building a single SRT for the entire module, so -- we must thread it through all the procedures as we cps-convert them. us <- mkSplitUniqSupply 'S' let initTopSRT = initUs_ us emptySRT - (topSRT, prog) <- foldM (cmmPipeline hsc_env) (initTopSRT, []) prog - let prog' = map cmmOfZgraph (srtToData topSRT : prog) - dumpIfSet_dyn dflags Opt_D_dump_cmmz "Output Cmm" (ppr prog') - return prog' + let run_pipeline topSRT cmmgroup = do + (topSRT, cmmgroup) <- cmmPipeline hsc_env topSRT cmmgroup + return (topSRT,cmmOfZgraph cmmgroup) + + let pipeline_stream = {-# SCC "cmmPipeline" #-} do + topSRT <- Stream.mapAccumL run_pipeline initTopSRT ppr_stream1 + Stream.yield (cmmOfZgraph (srtToData topSRT)) + + let + dump2 a = do dumpIfSet_dyn dflags Opt_D_dump_cmmz "Output Cmm" $ ppr a + return a + + ppr_stream2 = Stream.mapM dump2 pipeline_stream + + return ppr_stream2 + + myCoreToStg :: DynFlags -> Module -> CoreProgram -> IO ( [(StgBinding,[(Id,[Id])])] -- output program diff --git a/compiler/main/HscTypes.lhs b/compiler/main/HscTypes.lhs index 156f081d3e..adaa9a3171 100644 --- a/compiler/main/HscTypes.lhs +++ b/compiler/main/HscTypes.lhs @@ -142,7 +142,6 @@ import Packages hiding ( 
Version(..) ) import DynFlags import DriverPhases import BasicTypes -import OptimizationFuel ( OptFuelState ) import IfaceSyn import CoreSyn ( CoreRule, CoreVect ) import Maybes @@ -318,11 +317,6 @@ data HscEnv -- ^ This caches the location of modules, so we don't have to -- search the filesystem multiple times. See also 'hsc_FC'. - hsc_OptFuel :: OptFuelState, - -- ^ Settings to control the use of \"optimization fuel\": - -- by limiting the number of transformations, - -- we can use binary search to help find compiler bugs. - hsc_type_env_var :: Maybe (Module, IORef TypeEnv) -- ^ Used for one-shot compilation only, to initialise -- the 'IfGblEnv'. See 'TcRnTypes.tcg_type_env_var' for diff --git a/compiler/nativeGen/AsmCodeGen.lhs b/compiler/nativeGen/AsmCodeGen.lhs index 51adf46005..4b49fe304e 100644 --- a/compiler/nativeGen/AsmCodeGen.lhs +++ b/compiler/nativeGen/AsmCodeGen.lhs @@ -71,6 +71,8 @@ import FastString import UniqSet import ErrUtils import Module +import Stream (Stream) +import qualified Stream -- DEBUGGING ONLY --import OrdList @@ -147,7 +149,7 @@ data NcgImpl statics instr jumpDest = NcgImpl { } -------------------- -nativeCodeGen :: DynFlags -> Handle -> UniqSupply -> [RawCmmGroup] -> IO () +nativeCodeGen :: DynFlags -> Handle -> UniqSupply -> Stream IO RawCmmGroup () -> IO () nativeCodeGen dflags h us cmms = let platform = targetPlatform dflags nCG' :: (Outputable statics, Outputable instr, Instruction instr) => NcgImpl statics instr jumpDest -> IO () @@ -209,16 +211,16 @@ nativeCodeGen dflags h us cmms nativeCodeGen' :: (Outputable statics, Outputable instr, Instruction instr) => DynFlags -> NcgImpl statics instr jumpDest - -> Handle -> UniqSupply -> [RawCmmGroup] -> IO () + -> Handle -> UniqSupply -> Stream IO RawCmmGroup () -> IO () nativeCodeGen' dflags ncgImpl h us cmms = do let platform = targetPlatform dflags - split_cmms = concat $ map add_split cmms + split_cmms = Stream.map add_split cmms -- BufHandle is a performance hack. 
We could hide it inside -- Pretty if it weren't for the fact that we do lots of little -- printDocs here (in order to do codegen in constant space). bufh <- newBufHandle h - (imports, prof) <- cmmNativeGens dflags ncgImpl bufh us split_cmms [] [] 0 + (imports, prof) <- cmmNativeGenStream dflags ncgImpl bufh us split_cmms [] [] 0 bFlush bufh let (native, colorStats, linearStats) @@ -272,6 +274,34 @@ nativeCodeGen' dflags ncgImpl h us cmms split_marker = CmmProc Nothing mkSplitMarkerLabel (ListGraph []) +cmmNativeGenStream :: (Outputable statics, Outputable instr, Instruction instr) + => DynFlags + -> NcgImpl statics instr jumpDest + -> BufHandle + -> UniqSupply + -> Stream IO RawCmmGroup () + -> [[CLabel]] + -> [ ([NatCmmDecl statics instr], + Maybe [Color.RegAllocStats statics instr], + Maybe [Linear.RegAllocStats]) ] + -> Int + -> IO ( [[CLabel]], + [([NatCmmDecl statics instr], + Maybe [Color.RegAllocStats statics instr], + Maybe [Linear.RegAllocStats])] ) + +cmmNativeGenStream dflags ncgImpl h us cmm_stream impAcc profAcc count + = do + r <- Stream.runStream cmm_stream + case r of + Left () -> return (reverse impAcc, reverse profAcc) + Right (cmms, cmm_stream') -> do + (impAcc,profAcc,us') <- cmmNativeGens dflags ncgImpl h us cmms + impAcc profAcc count + cmmNativeGenStream dflags ncgImpl h us' cmm_stream' + impAcc profAcc count + + -- | Do native code generation on all these cmms. 
-- cmmNativeGens :: (Outputable statics, Outputable instr, Instruction instr) @@ -287,11 +317,12 @@ cmmNativeGens :: (Outputable statics, Outputable instr, Instruction instr) -> Int -> IO ( [[CLabel]], [([NatCmmDecl statics instr], - Maybe [Color.RegAllocStats statics instr], - Maybe [Linear.RegAllocStats])] ) + Maybe [Color.RegAllocStats statics instr], + Maybe [Linear.RegAllocStats])], + UniqSupply ) -cmmNativeGens _ _ _ _ [] impAcc profAcc _ - = return (reverse impAcc, reverse profAcc) +cmmNativeGens _ _ _ us [] impAcc profAcc _ + = return (impAcc,profAcc,us) cmmNativeGens dflags ncgImpl h us (cmm : cmms) impAcc profAcc count = do @@ -817,7 +848,11 @@ Ideas for other things we could do (put these in Hoopl please!): cmmToCmm :: DynFlags -> RawCmmDecl -> (RawCmmDecl, [CLabel]) cmmToCmm _ top@(CmmData _ _) = (top, []) cmmToCmm dflags (CmmProc info lbl (ListGraph blocks)) = runCmmOpt dflags $ do - blocks' <- mapM cmmBlockConFold (cmmMiniInline dflags (cmmEliminateDeadBlocks blocks)) + let reachable_blocks | dopt Opt_TryNewCodeGen dflags = blocks + | otherwise = cmmEliminateDeadBlocks blocks + -- The new codegen path has already eliminated unreachable blocks by now + + blocks' <- mapM cmmBlockConFold (cmmMiniInline dflags reachable_blocks) return $ CmmProc info lbl (ListGraph blocks') newtype CmmOptM a = CmmOptM (([CLabel], DynFlags) -> (# a, [CLabel] #)) @@ -911,7 +946,8 @@ cmmExprConFold referenceKind expr = do dflags <- getDynFlags -- Skip constant folding if new code generator is running -- (this optimization is done in Hoopl) - let expr' = if dopt Opt_TryNewCodeGen dflags + -- SDM: re-enabled for now, while cmmRewriteAssignments is turned off + let expr' = if False -- dopt Opt_TryNewCodeGen dflags then expr else cmmExprCon (targetPlatform dflags) expr cmmExprNative referenceKind expr' diff --git a/compiler/simplStg/SimplStg.lhs b/compiler/simplStg/SimplStg.lhs index d5024ab2e0..635df3ce41 100644 --- a/compiler/simplStg/SimplStg.lhs +++ 
b/compiler/simplStg/SimplStg.lhs @@ -55,7 +55,10 @@ stg2stg dflags module_name binds ; (processed_binds, _, cost_centres) <- foldl_mn do_stg_pass (binds', us0, ccs) (getStgToDo dflags) - ; let srt_binds = computeSRTs (unarise us1 processed_binds) + ; let un_binds = unarise us1 processed_binds + ; let srt_binds + | dopt Opt_TryNewCodeGen dflags = zip un_binds (repeat []) + | otherwise = computeSRTs un_binds ; dumpIfSet_dyn dflags Opt_D_dump_stg "STG syntax:" (pprStgBindingsWithSRTs srt_binds) diff --git a/compiler/utils/OrdList.lhs b/compiler/utils/OrdList.lhs index a56cdf3f58..7e3b24a5da 100644 --- a/compiler/utils/OrdList.lhs +++ b/compiler/utils/OrdList.lhs @@ -27,11 +27,14 @@ infixl 5 `snocOL` infixr 5 `consOL` data OrdList a - = Many [a] -- Invariant: non-empty + = None + | One a + | Many [a] -- Invariant: non-empty + | Cons a (OrdList a) + | Snoc (OrdList a) a | Two (OrdList a) -- Invariant: non-empty (OrdList a) -- Invariant: non-empty - | One a - | None + nilOL :: OrdList a isNilOL :: OrdList a -> Bool @@ -44,22 +47,33 @@ concatOL :: [OrdList a] -> OrdList a nilOL = None unitOL as = One as -snocOL None b = One b -snocOL as b = Two as (One b) -consOL a None = One a -consOL a bs = Two (One a) bs +snocOL as b = Snoc as b +consOL a bs = Cons a bs concatOL aas = foldr appOL None aas isNilOL None = True isNilOL _ = False -appOL None bs = bs -appOL as None = as -appOL as bs = Two as bs +None `appOL` b = b +a `appOL` None = a +One a `appOL` b = Cons a b +a `appOL` One b = Snoc a b +a `appOL` b = Two a b + +fromOL :: OrdList a -> [a] +fromOL a = go a [] + where go None acc = acc + go (One a) acc = a : acc + go (Cons a b) acc = a : go b acc + go (Snoc a b) acc = go a (b:acc) + go (Two a b) acc = go a (go b acc) + go (Many xs) acc = xs ++ acc mapOL :: (a -> b) -> OrdList a -> OrdList b mapOL _ None = None mapOL f (One x) = One (f x) +mapOL f (Cons x xs) = Cons (f x) (mapOL f xs) +mapOL f (Snoc xs x) = Snoc (mapOL f xs) (f x) mapOL f (Two x y) = Two (mapOL f x) (mapOL f 
y) mapOL f (Many xs) = Many (map f xs) @@ -69,24 +83,19 @@ instance Functor OrdList where foldrOL :: (a->b->b) -> b -> OrdList a -> b foldrOL _ z None = z foldrOL k z (One x) = k x z +foldrOL k z (Cons x xs) = k x (foldrOL k z xs) +foldrOL k z (Snoc xs x) = foldrOL k (k x z) xs foldrOL k z (Two b1 b2) = foldrOL k (foldrOL k z b2) b1 foldrOL k z (Many xs) = foldr k z xs foldlOL :: (b->a->b) -> b -> OrdList a -> b foldlOL _ z None = z foldlOL k z (One x) = k z x +foldlOL k z (Cons x xs) = foldlOL k (k z x) xs +foldlOL k z (Snoc xs x) = k (foldlOL k z xs) x foldlOL k z (Two b1 b2) = foldlOL k (foldlOL k z b1) b2 foldlOL k z (Many xs) = foldl k z xs -fromOL :: OrdList a -> [a] -fromOL ol - = flat ol [] - where - flat None rest = rest - flat (One x) rest = x:rest - flat (Two a b) rest = flat a (flat b rest) - flat (Many xs) rest = xs ++ rest - toOL :: [a] -> OrdList a toOL [] = None toOL xs = Many xs diff --git a/compiler/utils/Stream.hs b/compiler/utils/Stream.hs new file mode 100644 index 0000000000..2fa76d2345 --- /dev/null +++ b/compiler/utils/Stream.hs @@ -0,0 +1,97 @@ +-- ----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow 2012 +-- +-- Monadic streams +-- +-- ----------------------------------------------------------------------------- + +module Stream ( + Stream(..), yield, liftIO, + collect, fromList, + Stream.map, Stream.mapM, Stream.mapAccumL + ) where + +-- | +-- @Stream m a b@ is a computation in some Monad @m@ that delivers a sequence +-- of elements of type @a@ followed by a result of type @b@. +-- +-- More concretely, a value of type @Stream m a b@ can be run using @runStream@ +-- in the Monad @m@, and it delivers either +-- +-- * the final result: @Left b@, or +-- * @Right (a,str)@, where @a@ is the next element in the stream, and @str@ +-- is a computation to get the rest of the stream. 
+-- +-- Stream is itself a Monad, and provides an operation 'yield' that +-- produces a new element of the stream. This makes it convenient to turn +-- existing monadic computations into streams. +-- +-- The idea is that Stream is useful for making a monadic computation +-- that produces values from time to time. This can be used for +-- knitting together two complex monadic operations, so that the +-- producer does not have to produce all its values before the +-- consumer starts consuming them. We make the producer into a +-- Stream, and the consumer pulls on the stream each time it wants a +-- new value. +-- +newtype Stream m a b = Stream { runStream :: m (Either b (a, Stream m a b)) } + +instance Monad m => Monad (Stream m a) where + return a = Stream (return (Left a)) + + Stream m >>= k = Stream $ do + r <- m + case r of + Left b -> runStream (k b) + Right (a,str) -> return (Right (a, str >>= k)) + +yield :: Monad m => a -> Stream m a () +yield a = Stream (return (Right (a, return ()))) + +liftIO :: IO a -> Stream IO b a +liftIO io = Stream $ io >>= return . Left + +-- | Turn a Stream into an ordinary list, by demanding all the elements. +collect :: Monad m => Stream m a () -> m [a] +collect str = go str [] + where + go str acc = do + r <- runStream str + case r of + Left () -> return (reverse acc) + Right (a, str') -> go str' (a:acc) + +-- | Turn a list into a 'Stream', by yielding each element in turn. 
+fromList :: Monad m => [a] -> Stream m a () +fromList = mapM_ yield + +-- | Apply a function to each element of a 'Stream', lazilly +map :: Monad m => (a -> b) -> Stream m a x -> Stream m b x +map f str = Stream $ do + r <- runStream str + case r of + Left x -> return (Left x) + Right (a, str') -> return (Right (f a, Stream.map f str')) + +-- | Apply a monadic operation to each element of a 'Stream', lazilly +mapM :: Monad m => (a -> m b) -> Stream m a x -> Stream m b x +mapM f str = Stream $ do + r <- runStream str + case r of + Left x -> return (Left x) + Right (a, str') -> do + b <- f a + return (Right (b, Stream.mapM f str')) + +-- | analog of the list-based 'mapAccumL' on Streams. This is a simple +-- way to map over a Stream while carrying some state around. +mapAccumL :: Monad m => (c -> a -> m (c,b)) -> c -> Stream m a () + -> Stream m b c +mapAccumL f c str = Stream $ do + r <- runStream str + case r of + Left () -> return (Left c) + Right (a, str') -> do + (c',b) <- f c a + return (Right (b, mapAccumL f c' str')) |