Merge branch 'master' of http://darcs.haskell.org/ghc

author: Simon Peyton Jones <simonpj@microsoft.com> 2012-07-10 16:18:54 +0100
committer: Simon Peyton Jones <simonpj@microsoft.com> 2012-07-10 16:18:54 +0100
commit: b8bfab8076f1d8fd5ed1d634fee82fd4d2fc0ed8 (patch)
tree: e736c4fd8a77773e081e3987d4c17bb2a9e7c98e /compiler/cmm
parent: c548f91feddf149ee4d3358483828f2d4c0ec41b (diff)
parent: 713cf473de8a2ad7d0b8195d78860c25fec41839 (diff)
download: haskell-b8bfab8076f1d8fd5ed1d634fee82fd4d2fc0ed8.tar.gz
32 files changed, 3432 insertions, 2916 deletions
diff --git a/compiler/cmm/BlockId.hs b/compiler/cmm/BlockId.hs
index feeacb553d..95293c850b 100644
--- a/compiler/cmm/BlockId.hs
+++ b/compiler/cmm/BlockId.hs
@@ -15,7 +15,7 @@ import Outputable
 import Unique
 
 import Compiler.Hoopl as Hoopl hiding (Unique)
-import Compiler.Hoopl.GHC (uniqueToInt, uniqueToLbl, lblToUnique)
+import Compiler.Hoopl.Internals (uniqueToLbl, lblToUnique)
 
 ----------------------------------------------------------------
 --- Block Ids, their environments, and their sets
@@ -32,14 +32,14 @@ compilation unit in which it appears.
 type BlockId = Hoopl.Label
 
 instance Uniquable BlockId where
-  getUnique label = getUnique (uniqueToInt $ lblToUnique label)
-
-mkBlockId :: Unique -> BlockId
-mkBlockId unique = uniqueToLbl $ intToUnique $ getKey unique
+  getUnique label = getUnique (lblToUnique label)
 
 instance Outputable BlockId where
   ppr label = ppr (getUnique label)
 
+mkBlockId :: Unique -> BlockId
+mkBlockId unique = uniqueToLbl $ intToUnique $ getKey unique
+
 retPtLbl :: BlockId -> CLabel
 retPtLbl label = mkReturnPtLabel $ getUnique label
 
diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs
index f1318c1dc9..1c77409e49 100644
--- a/compiler/cmm/Cmm.hs
+++ b/compiler/cmm/Cmm.hs
@@ -32,9 +32,9 @@ module Cmm (
 import CLabel
 import BlockId
 import CmmNode
-import OptimizationFuel as F
 import SMRep
 import CmmExpr
+import UniqSupply
 import Compiler.Hoopl
 
 import Data.Word        ( Word8 )
@@ -69,8 +69,6 @@ type CmmGroup = GenCmmGroup CmmStatics CmmTopInfo CmmGraph
 --   (a) C--, i.e. populated with various C-- constructs
 --       (Cmm and RawCmm in OldCmm.hs)
 --   (b) Native code, populated with data/instructions
---
--- A second family of instances based on Hoopl is in Cmm.hs.
 
 -- | A top-level chunk, abstracted over the type of the contents of
 -- the basic blocks (Cmm or instructions are the likely instantiations).
@@ -95,19 +93,23 @@ data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C }
 type CmmBlock = Block CmmNode C C
 
 type CmmReplGraph e x = GenCmmReplGraph CmmNode e x
-type GenCmmReplGraph n e x = FuelUniqSM (Maybe (Graph n e x))
-type CmmFwdRewrite f = FwdRewrite FuelUniqSM CmmNode f
-type CmmBwdRewrite f = BwdRewrite FuelUniqSM CmmNode f
+type GenCmmReplGraph n e x = UniqSM (Maybe (Graph n e x))
+type CmmFwdRewrite f = FwdRewrite UniqSM CmmNode f
+type CmmBwdRewrite f = BwdRewrite UniqSM CmmNode f
 
 -----------------------------------------------------------------------------
 --     Info Tables
 -----------------------------------------------------------------------------
 
-data CmmTopInfo   = TopInfo {info_tbl :: CmmInfoTable, stack_info :: CmmStackInfo}
+data CmmTopInfo   = TopInfo { info_tbl :: CmmInfoTable
+                            , stack_info :: CmmStackInfo }
 
 data CmmStackInfo
    = StackInfo {
-       arg_space :: ByteOff,            -- XXX: comment?
+       arg_space :: ByteOff,
+               -- number of bytes of arguments on the stack on entry to the
+               -- the proc.  This is filled in by StgCmm.codeGen, and used
+               -- by the stack allocator later.
        updfr_space :: Maybe ByteOff     -- XXX: comment?
    }
 
diff --git a/compiler/cmm/CmmBuildInfoTables.hs b/compiler/cmm/CmmBuildInfoTables.hs
index ab829de499..ebe755219b 100644
--- a/compiler/cmm/CmmBuildInfoTables.hs
+++ b/compiler/cmm/CmmBuildInfoTables.hs
@@ -14,169 +14,53 @@
 {-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
 module CmmBuildInfoTables
     ( CAFSet, CAFEnv, cafAnal, localCAFInfo, mkTopCAFInfo
-    , setInfoTableSRT, setInfoTableStackMap
+    , setInfoTableSRT
     , TopSRT, emptySRT, srtToData
     , bundleCAFs
-    , lowerSafeForeignCalls
-    , cafTransfers, liveSlotTransfers
-    , mkLiveness )
+    , cafTransfers )
 where
 
 #include "HsVersions.h"
 
 -- These should not be imported here!
-import StgCmmForeign
 import StgCmmUtils
 
-import Constants
 import Digraph
 import qualified Prelude as P
 import Prelude hiding (succ)
-import Util
 
 import BlockId
 import Bitmap
 import CLabel
 import Cmm
 import CmmUtils
-import CmmStackLayout
-import Module
-import FastString
-import ForeignCall
 import IdInfo
 import Data.List
 import Maybes
-import MkGraph as M
-import Control.Monad
 import Name
-import OptimizationFuel
 import Outputable
 import SMRep
 import UniqSupply
 
-import Compiler.Hoopl
+import Hoopl
 
 import Data.Map (Map)
 import qualified Data.Map as Map
-import qualified FiniteMap as Map
+import Data.Set (Set)
+import qualified Data.Set as Set
+
+foldSet :: (a -> b -> b) -> b -> Set a -> b
+#if __GLASGOW_HASKELL__ < 704
+foldSet = Set.fold
+#else
+foldSet = Set.foldr
+#endif
 
 ----------------------------------------------------------------
 -- Building InfoTables
 
 
 -----------------------------------------------------------------------
--- Stack Maps
-
--- Given a block ID, we return a representation of the layout of the stack,
--- as suspended before entering that block.
--- (For a return site to a function call, the layout does not include the
---  parameter passing area (or the "return address" on the stack)).
--- If the element is `Nothing`, then it represents a word of the stack that
--- does not contain a live pointer.
--- If the element is `Just` a register, then it represents a live spill slot
--- for a pointer; we assume that a pointer is the size of a word.
--- The head of the list represents the young end of the stack where the infotable
--- pointer for the block `Bid` is stored.
--- The infotable pointer itself is not included in the list.
--- Call areas are also excluded from the list: besides the stuff in the update
--- frame (and the return infotable), call areas should never be live across
--- function calls.
-
--- RTS Invariant: All pointers must be word-aligned because each bit in the bitmap
--- represents a word. Consequently, we have to be careful when we see a live slot
--- on the stack: if we have packed multiple sub-word values into a word,
--- we have to make sure that we only mark the entire word as a non-pointer.
-
--- Also, don't forget to stop at the old end of the stack (oldByte),
--- which may differ depending on whether there is an update frame.
-
-type RegSlotInfo
-   = ( Int	  -- Offset from oldest byte of Old area
-     , LocalReg   -- The register
-     , Int)       -- Width of the register
-
-live_ptrs :: ByteOff -> BlockEnv SubAreaSet -> AreaMap -> BlockId -> StackLayout
-live_ptrs oldByte slotEnv areaMap bid =
-  -- pprTrace "live_ptrs for" (ppr bid <+> text (show oldByte ++ "-" ++ show youngByte) <+>
-  --                           ppr liveSlots) $
-  -- pprTrace ("stack layout for " ++ show bid ++ ": ") (ppr res) $ res
-  res
-  where 
-        res = mkLiveness (reverse $ slotsToList youngByte liveSlots [])
- 
-        slotsToList :: Int -> [RegSlotInfo] -> [Maybe LocalReg] -> [Maybe LocalReg]
-        -- n starts at youngByte and is decremented down to oldByte
-	-- Returns a list, one element per word, with 
-	--    (Just r) meaning 'pointer register r is saved here', 
-	--    Nothing  meaning 'non-pointer or empty'
-
-        slotsToList n [] results | n == oldByte = results -- at old end of stack frame
-
-        slotsToList n (s : _) _  | n == oldByte =
-          pprPanic "slot left off live_ptrs" (ppr s <+> ppr oldByte <+>
-               ppr n <+> ppr liveSlots <+> ppr youngByte)
-
-        slotsToList n _ _ | n < oldByte =
-          panic "stack slots not allocated on word boundaries?"
-
-        slotsToList n l@((n', r, w) : rst) results =
-          if n == (n' + w) then -- slot's young byte is at n
-            ASSERT (not (isPtr r) ||
-                    (n `mod` wORD_SIZE == 0 && w == wORD_SIZE)) -- ptrs must be aligned
-            slotsToList next (dropWhile (non_ptr_younger_than next) rst)
-                        (stack_rep : results)
-          else slotsToList next (dropWhile (non_ptr_younger_than next) l)
-                           (Nothing : results)
-          where next = n - wORD_SIZE
-                stack_rep = if isPtr r then Just r else Nothing
-
-        slotsToList n [] results = slotsToList (n - wORD_SIZE) [] (Nothing : results)
-
-        non_ptr_younger_than next (n', r, w) =
-          n' + w > next &&
-            ASSERT (not (isPtr r))
-            True
-        isPtr = isGcPtrType . localRegType
-
-        liveSlots :: [RegSlotInfo]
-        liveSlots = sortBy (\ (off,_,_) (off',_,_) -> compare off' off)
-                           (Map.foldRightWithKey (\_ -> flip $ foldl add_slot) [] slots)
-                    
-        add_slot :: [RegSlotInfo] -> SubArea -> [RegSlotInfo]
-        add_slot rst (a@(RegSlot r@(LocalReg _ ty)), off, w) = 
-          if off == w && widthInBytes (typeWidth ty) == w then
-            (expectJust "add_slot" (Map.lookup a areaMap), r, w) : rst
-          else panic "live_ptrs: only part of a variable live at a proc point"
-        add_slot rst (CallArea Old, _, _) =
-          rst -- the update frame (or return infotable) should be live
-              -- would be nice to check that only that part of the callarea is live...
-        add_slot rst ((CallArea _), _, _) =
-          rst
-          -- JD: THIS ISN'T CURRENTLY A CORRECTNESS PROBLEM, BUT WE SHOULD REALLY
-          -- MAKE LIVENESS INFO AROUND CALLS MORE PRECISE -- FOR NOW, A 32-BIT
-          -- FLOAT PADS OUT TO 64 BITS, BUT WE ASSUME THE WHOLE PARAMETER-PASSING
-          -- AREA IS LIVE (WHICH IT ISN'T...).  WE SHOULD JUST PUT THE LIVE AREAS
-          -- IN THE CALL NODES, WHICH SHOULD EVENTUALLY HAVE LIVE REGISTER AS WELL,
-          -- SO IT'S ALL GOING IN THE SAME DIRECTION.
-          -- pprPanic "CallAreas must not be live across function calls" (ppr bid <+> ppr c)
-
-        slots :: SubAreaSet	 -- The SubAreaSet for 'bid'
-        slots = expectJust "live_ptrs slots" $ mapLookup bid slotEnv
-        youngByte = expectJust "live_ptrs bid_pos" $ Map.lookup (CallArea (Young bid)) areaMap
-
--- Construct the stack maps for a procedure _if_ it needs an infotable.
--- When wouldn't a procedure need an infotable? If it is a procpoint that
--- is not the successor of a call.
-setInfoTableStackMap :: SlotEnv -> AreaMap -> CmmDecl -> CmmDecl
-setInfoTableStackMap slotEnv areaMap
-     t@(CmmProc (TopInfo {stack_info=StackInfo {updfr_space = Just updfr_off}}) _ 
-                (CmmGraph {g_entry = eid}))
-  = updInfo (const (live_ptrs updfr_off slotEnv areaMap eid)) id t
-setInfoTableStackMap _ _ t = t
-                 
-
-
------------------------------------------------------------------------
 -- SRTs
 
 -- WE NEED AN EXAMPLE HERE.
@@ -191,14 +75,14 @@ setInfoTableStackMap _ _ t = t
 -----------------------------------------------------------------------
 -- Finding the CAFs used by a procedure
 
-type CAFSet = Map CLabel ()
+type CAFSet = Set CLabel
 type CAFEnv = BlockEnv CAFSet
 
 -- First, an analysis to find live CAFs.
 cafLattice :: DataflowLattice CAFSet
-cafLattice = DataflowLattice "live cafs" Map.empty add
-  where add _ (OldFact old) (NewFact new) = case old `Map.union` new of
-                                              new' -> (changeIf $ Map.size new' > Map.size old, new')
+cafLattice = DataflowLattice "live cafs" Set.empty add
+  where add _ (OldFact old) (NewFact new) = case old `Set.union` new of
+                                              new' -> (changeIf $ Set.size new' > Set.size old, new')
 
 cafTransfers :: BwdTransfer CmmNode CAFSet
 cafTransfers = mkBTransfer3 first middle last
@@ -210,11 +94,11 @@ cafTransfers = mkBTransfer3 first middle last
                CmmLit (CmmLabelOff c _)         -> add c set
                CmmLit (CmmLabelDiffOff c1 c2 _) -> add c1 $ add c2 set
                _ -> set
-        add l s = if hasCAF l then Map.insert (toClosureLbl l) () s
+        add l s = if hasCAF l then Set.insert (toClosureLbl l) s
                               else s
 
-cafAnal :: CmmGraph -> FuelUniqSM CAFEnv
-cafAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd cafLattice cafTransfers
+cafAnal :: CmmGraph -> CAFEnv
+cafAnal g = dataflowAnalBwd g [] $ analBwd cafLattice cafTransfers
 
 -----------------------------------------------------------------------
 -- Building the SRTs
@@ -264,15 +148,15 @@ srtToData srt = [CmmData RelocatableReadOnlyData (Statics (lbl srt) tbl)]
 -- we make sure they're all close enough to the bottom of the table that the
 -- bitmap will be able to cover all of them.
 buildSRTs :: TopSRT -> Map CLabel CAFSet -> CAFSet ->
-             FuelUniqSM (TopSRT, Maybe CmmDecl, C_SRT)
+             UniqSM (TopSRT, Maybe CmmDecl, C_SRT)
 buildSRTs topSRT topCAFMap cafs =
-  do let liftCAF lbl () z = -- get CAFs for functions without static closures
-           case Map.lookup lbl topCAFMap of Just cafs -> z `Map.union` cafs
-                                            Nothing   -> Map.insert lbl () z
+  do let liftCAF lbl z = -- get CAFs for functions without static closures
+           case Map.lookup lbl topCAFMap of Just cafs -> z `Set.union` cafs
+                                            Nothing   -> Set.insert lbl z
          -- For each label referring to a function f without a static closure,
          -- replace it with the CAFs that are reachable from f.
          sub_srt topSRT localCafs =
-           let cafs = Map.keys (Map.foldRightWithKey liftCAF Map.empty localCafs)
+           let cafs = Set.elems (foldSet liftCAF Set.empty localCafs)
                mkSRT topSRT =
                  do localSRTs <- procpointSRT (lbl topSRT) (elt_map topSRT) cafs
                     return (topSRT, localSRTs)
@@ -307,7 +191,7 @@ buildSRTs topSRT topCAFMap cafs =
 -- Construct an SRT bitmap.
 -- Adapted from simpleStg/SRT.lhs, which expects Id's.
 procpointSRT :: CLabel -> Map CLabel Int -> [CLabel] ->
-                FuelUniqSM (Maybe CmmDecl, C_SRT)
+                UniqSM (Maybe CmmDecl, C_SRT)
 procpointSRT _ _ [] =
  return (Nothing, NoC_SRT)
 procpointSRT top_srt top_table entries =
@@ -315,7 +199,7 @@ procpointSRT top_srt top_table entries =
     return (top, srt)
   where
     ints = map (expectJust "constructSRT" . flip Map.lookup top_table) entries
-    sorted_ints = sortLe (<=) ints
+    sorted_ints = sort ints
     offset = head sorted_ints
     bitmap_entries = map (subtract offset) sorted_ints
     len = P.last bitmap_entries + 1
@@ -325,7 +209,7 @@ maxBmpSize :: Int
 maxBmpSize = widthInBits wordWidth `div` 2
 
 -- Adapted from codeGen/StgCmmUtils, which converts from SRT to C_SRT.
-to_SRT :: CLabel -> Int -> Int -> Bitmap -> FuelUniqSM (Maybe CmmDecl, C_SRT)
+to_SRT :: CLabel -> Int -> Int -> Bitmap -> UniqSM (Maybe CmmDecl, C_SRT)
 to_SRT top_srt off len bmp
   | len > maxBmpSize || bmp == [fromIntegral srt_escape]
   = do id <- getUniqueM
@@ -373,30 +257,30 @@ mkTopCAFInfo localCAFs = foldl addToTop Map.empty g
           Map.insert l (flatten env cafset) env
         addToTop env (CyclicSCC nodes) =
           let (lbls, cafsets) = unzip nodes
-              cafset  = lbls `Map.deleteList` foldl Map.union Map.empty cafsets
+              cafset  = foldr Set.delete (foldl Set.union Set.empty cafsets) lbls
           in foldl (\env l -> Map.insert l (flatten env cafset) env) env lbls
-        flatten env cafset = Map.foldRightWithKey (lookup env) Map.empty cafset
-        lookup env caf () cafset' =
-          case Map.lookup caf env of Just cafs -> Map.foldRightWithKey add cafset' cafs
-                                     Nothing -> add caf () cafset'
-        add caf () cafset' = Map.insert caf () cafset'
+        flatten env cafset = foldSet (lookup env) Set.empty cafset
+        lookup env caf cafset' =
+          case Map.lookup caf env of Just cafs -> foldSet add cafset' cafs
+                                     Nothing -> add caf cafset'
+        add caf cafset' = Set.insert caf cafset'
         g = stronglyConnCompFromEdgedVertices
-              (map (\n@(l, cafs) -> (n, l, Map.keys cafs)) localCAFs)
+              (map (\n@(l, cafs) -> (n, l, Set.elems cafs)) localCAFs)
 
 -- Bundle the CAFs used at a procpoint.
 bundleCAFs :: CAFEnv -> CmmDecl -> (CAFSet, CmmDecl)
 bundleCAFs cafEnv t@(CmmProc _ _ (CmmGraph {g_entry=entry})) =
   (expectJust "bundleCAFs" (mapLookup entry cafEnv), t)
-bundleCAFs _ t = (Map.empty, t)
+bundleCAFs _ t = (Set.empty, t)
 
 -- Construct the SRTs for the given procedure.
 setInfoTableSRT :: Map CLabel CAFSet -> TopSRT -> (CAFSet, CmmDecl) ->
-                   FuelUniqSM (TopSRT, [CmmDecl])
+                   UniqSM (TopSRT, [CmmDecl])
 setInfoTableSRT topCAFMap topSRT (cafs, t) =
   setSRT cafs topCAFMap topSRT t
 
 setSRT :: CAFSet -> Map CLabel CAFSet -> TopSRT ->
-          CmmDecl -> FuelUniqSM (TopSRT, [CmmDecl])
+          CmmDecl -> UniqSM (TopSRT, [CmmDecl])
 setSRT cafs topCAFMap topSRT t =
   do (topSRT, cafTable, srt) <- buildSRTs topSRT topCAFMap cafs
      let t' = updInfo id (const srt) t
@@ -418,91 +302,3 @@ updInfoTbl toVars toSrt info_tbl@(CmmInfoTable {})
                            StackRep ls -> StackRep (toVars ls)
                            other       -> other }
 updInfoTbl _ _ t@CmmNonInfoTable = t
-  
-----------------------------------------------------------------
--- Safe foreign calls: We need to insert the code that suspends and resumes
--- the thread before and after a safe foreign call.
--- Why do we do this so late in the pipeline?
--- Because we need this code to appear without interrruption: you can't rely on the
--- value of the stack pointer between the call and resetting the thread state;
--- you need to have an infotable on the young end of the stack both when
--- suspending the thread and making the foreign call.
--- All of this is much easier if we insert the suspend and resume calls here.
-
--- At the same time, we prepare for the stages of the compiler that
--- build the proc points. We have to do this at the same time because
--- the safe foreign calls need special treatment with respect to infotables.
--- A safe foreign call needs an infotable even though it isn't
--- a procpoint. The following datatype captures the information
--- needed to generate the infotables along with the Cmm data and procedures.
-
--- JD: Why not do this while splitting procedures?
-lowerSafeForeignCalls :: AreaMap -> CmmDecl -> FuelUniqSM CmmDecl
-lowerSafeForeignCalls _ t@(CmmData _ _) = return t
-lowerSafeForeignCalls areaMap (CmmProc info l g@(CmmGraph {g_entry=entry})) = do
-  let block b mblocks = mblocks >>= lowerSafeCallBlock entry areaMap b
-  blocks <- foldGraphBlocks block (return mapEmpty) g
-  return $ CmmProc info l (ofBlockMap entry blocks)
-
--- If the block ends with a safe call in the block, lower it to an unsafe
--- call (with appropriate saves and restores before and after).
-lowerSafeCallBlock :: BlockId -> AreaMap -> CmmBlock -> BlockEnv CmmBlock
-                              -> FuelUniqSM (BlockEnv CmmBlock)
-lowerSafeCallBlock entry areaMap b blocks =
-  case blockToNodeList b of
-    (JustC (CmmEntry id), m, JustC l@(CmmForeignCall {})) -> lowerSafeForeignCall entry areaMap blocks id m l
-    _                                                    -> return $ insertBlock b blocks
-
--- Late in the code generator, we want to insert the code necessary
--- to lower a safe foreign call to a sequence of unsafe calls.
-lowerSafeForeignCall :: BlockId -> AreaMap -> BlockEnv CmmBlock -> BlockId -> [CmmNode O O] -> CmmNode O C
-                                -> FuelUniqSM (BlockEnv CmmBlock)
-lowerSafeForeignCall entry areaMap blocks bid m
-    (CmmForeignCall {tgt=tgt, res=rs, args=as, succ=succ, updfr = updfr_off, intrbl = intrbl}) =
- do let newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep)
-    -- Both 'id' and 'new_base' are KindNonPtr because they're
-    -- RTS-only objects and are not subject to garbage collection
-    id <- newTemp bWord
-    new_base <- newTemp (cmmRegType (CmmGlobal BaseReg))
-    let (caller_save, caller_load) = callerSaveVolatileRegs
-    load_tso <- newTemp gcWord -- TODO FIXME NOW
-    load_stack <- newTemp gcWord -- TODO FIXME NOW
-    let (<**>) = (M.<*>)
-    let suspendThread = foreignLbl "suspendThread"
-        resumeThread  = foreignLbl "resumeThread"
-        foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit name)))
-        suspend = saveThreadState <**>
-                  caller_save <**>
-                  mkUnsafeCall (ForeignTarget suspendThread
-                                (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
-                               [id] [CmmReg (CmmGlobal BaseReg), CmmLit (CmmInt (fromIntegral (fromEnum intrbl)) wordWidth)]
-        midCall = mkUnsafeCall tgt rs as
-        resume  = mkUnsafeCall (ForeignTarget resumeThread
-                                (ForeignConvention CCallConv [AddrHint] [AddrHint]))
-                     [new_base] [CmmReg (CmmLocal id)] <**>
-                  -- Assign the result to BaseReg: we
-                  -- might now have a different Capability!
-                  mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <**>
-                  caller_load <**>
-                  loadThreadState load_tso load_stack
-        -- We have to save the return value on the stack because its next use
-        -- may appear in a different procedure due to procpoint splitting...
-        saveRetVals = foldl (<**>) emptyAGraph $ map (M.mkMiddle . spill) rs
-        spill r = CmmStore (regSlot r) (CmmReg $ CmmLocal r)
-        regSlot r@(LocalReg _ _) = CmmRegOff (CmmGlobal Sp) (sp_off - offset)
-          where offset = w + expectJust "lowerForeign" (Map.lookup (RegSlot r) areaMap)
-                sp_off = wORD_SIZE + expectJust "lowerForeign" (Map.lookup (CallArea area) areaMap)
-                area = if succ == entry then Old else Young succ
-                w = widthInBytes $ typeWidth $ localRegType r
-        -- Note: The successor must be a procpoint, and we have already split,
-        --       so we use a jump, not a branch.
-        succLbl = CmmLit (CmmLabel (infoTblLbl succ))
-        jump = CmmCall { cml_target  = succLbl, cml_cont = Nothing
-                       , cml_args    = widthInBytes wordWidth ,cml_ret_args = 0
-                       , cml_ret_off = updfr_off}
-    graph' <- liftUniq $ labelAGraph bid $ catAGraphs (map M.mkMiddle m) <**>
-                                           suspend <**> midCall <**>
-                                           resume  <**> saveRetVals <**> M.mkLast jump
-    return $ blocks `mapUnion` toBlockMap graph'
-lowerSafeForeignCall _ _ _ _ _ _ = panic "lowerSafeForeignCall was passed something else"
-
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index d3d9ba4b41..a76ad6f00a 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -7,7 +7,8 @@
 
 module CmmCallConv (
   ParamLocation(..),
-  assignArgumentsPos
+  assignArgumentsPos,
+  globalArgRegs
 ) where
 
 #include "HsVersions.h"
@@ -53,7 +54,6 @@ assignArgumentsPos conv arg_ty reps = assignments
                ([_], PrimOpReturn)     -> allRegs
                (_,   PrimOpReturn)     -> getRegsWithNode
                (_,   Slow)             -> noRegs
-               _ -> pprPanic "Unknown calling convention" (ppr conv)
       -- The calling conventions first assign arguments to registers,
       -- then switch to the stack when we first run out of registers
       -- (even if there are still available registers for args of a different type).
@@ -130,18 +130,25 @@ getRegsWithNode =
   (intRegs, map FloatReg  floatRegNos, map DoubleReg doubleRegNos, map LongReg longRegNos)
     where intRegs = map VanillaReg vanillaRegNos
 
-allVanillaRegNos, allFloatRegNos, allDoubleRegNos, allLongRegNos :: [Int]
-allVanillaRegNos = regList mAX_Vanilla_REG
-allFloatRegNos	 = regList mAX_Float_REG
-allDoubleRegNos	 = regList mAX_Double_REG
-allLongRegNos	   = regList mAX_Long_REG
+allFloatRegs, allDoubleRegs, allLongRegs :: [GlobalReg]
+allVanillaRegs :: [VGcPtr -> GlobalReg]
+
+allVanillaRegs = map VanillaReg $ regList mAX_Vanilla_REG
+allFloatRegs   = map FloatReg   $ regList mAX_Float_REG
+allDoubleRegs  = map DoubleReg  $ regList mAX_Double_REG
+allLongRegs    = map LongReg    $ regList mAX_Long_REG
 
 regList :: Int -> [Int]
 regList n = [1 .. n]
 
 allRegs :: AvailRegs
-allRegs = (map VanillaReg allVanillaRegNos, map FloatReg allFloatRegNos,
-           map DoubleReg  allDoubleRegNos,  map LongReg  allLongRegNos)
+allRegs  = (allVanillaRegs, allFloatRegs, allDoubleRegs, allLongRegs)
 
 noRegs :: AvailRegs
-noRegs    = ([], [], [], [])
+noRegs  = ([], [], [], [])
+
+globalArgRegs :: [GlobalReg]
+globalArgRegs = map ($VGcPtr) allVanillaRegs ++
+                allFloatRegs ++
+                allDoubleRegs ++
+                allLongRegs
diff --git a/compiler/cmm/CmmCommonBlockElim.hs b/compiler/cmm/CmmCommonBlockElim.hs
index abbfd01156..614edf23a2 100644
--- a/compiler/cmm/CmmCommonBlockElim.hs
+++ b/compiler/cmm/CmmCommonBlockElim.hs
@@ -13,22 +13,22 @@ where
 import BlockId
 import Cmm
 import CmmUtils
+import CmmContFlowOpt
 import Prelude hiding (iterate, succ, unzip, zip)
 
-import Compiler.Hoopl
+import Hoopl hiding (ChangeFlag)
 import Data.Bits
 import qualified Data.List as List
 import Data.Word
-import FastString
-import Control.Monad
 import Outputable
 import UniqFM
-import Unique
 
 my_trace :: String -> SDoc -> a -> a
 my_trace = if False then pprTrace else \_ _ a -> a
 
--- Eliminate common blocks:
+-- -----------------------------------------------------------------------------
+-- Eliminate common blocks
+
 -- If two blocks are identical except for the label on the first node,
 -- then we can eliminate one of the blocks. To ensure that the semantics
 -- of the program are preserved, we have to rewrite each predecessor of the
@@ -42,59 +42,50 @@ my_trace = if False then pprTrace else \_ _ a -> a
 
 -- TODO: Use optimization fuel
 elimCommonBlocks :: CmmGraph -> CmmGraph
-elimCommonBlocks g =
-    upd_graph g . snd $ iterate common_block reset hashed_blocks
-                                (emptyUFM, mapEmpty)
-      where hashed_blocks    = map (\b -> (hash_block b, b)) (reverse (postorderDfs g))
-            reset (_, subst) = (emptyUFM, subst)
+elimCommonBlocks g = replaceLabels env g
+  where
+     env = iterate hashed_blocks mapEmpty
+     hashed_blocks = map (\b -> (hash_block b, b)) $ postorderDfs g
 
 -- Iterate over the blocks until convergence
-iterate :: (t -> a -> (Bool, t)) -> (t -> t) -> [a] -> t -> t
-iterate upd reset blocks state =
-  case foldl upd' (False, state) blocks of
-    (True,  state') -> iterate upd reset blocks (reset state')
-    (False, state') -> state'
-  where upd' (b, s) a = let (b', s') = upd s a in (b || b', s') -- lift to track changes
+iterate :: [(HashCode,CmmBlock)] -> BlockEnv BlockId -> BlockEnv BlockId
+iterate blocks subst =
+  case foldl common_block (False, emptyUFM, subst) blocks of
+    (changed,  _, subst)
+       | changed   -> iterate blocks subst
+       | otherwise -> subst
+
+type State  = (ChangeFlag, UniqFM [CmmBlock], BlockEnv BlockId)
+
+type ChangeFlag = Bool
+type HashCode = Int
 
 -- Try to find a block that is equal (or ``common'') to b.
-type BidMap = BlockEnv BlockId
-type State  = (UniqFM [CmmBlock], BidMap)
-common_block :: (Outputable h, Uniquable h) =>  State -> (h, CmmBlock) -> (Bool, State)
-common_block (bmap, subst) (hash, b) =
+common_block :: State -> (HashCode, CmmBlock) -> State
+common_block (old_change, bmap, subst) (hash, b) =
   case lookupUFM bmap hash of
     Just bs -> case (List.find (eqBlockBodyWith (eqBid subst) b) bs,
                      mapLookup bid subst) of
                  (Just b', Nothing)                         -> addSubst b'
                  (Just b', Just b'') | entryLabel b' /= b'' -> addSubst b'
-                 _ -> (False, (addToUFM bmap hash (b : bs), subst))
-    Nothing -> (False, (addToUFM bmap hash [b], subst))
+                                     | otherwise -> (old_change, bmap, subst)
+                 _ -> (old_change, addToUFM bmap hash (b : bs), subst)
+    Nothing -> (old_change, addToUFM bmap hash [b], subst)
   where bid = entryLabel b
-        addSubst b' = my_trace "found new common block" (ppr (entryLabel b')) $
-                      (True, (bmap, mapInsert bid (entryLabel b') subst))
-
--- Given the map ``subst'' from BlockId -> BlockId, we rewrite the graph.
-upd_graph :: CmmGraph -> BidMap -> CmmGraph
-upd_graph g subst = mapGraphNodes (id, middle, last) g
-  where middle = mapExpDeep exp
-        last l = last' (mapExpDeep exp l)
-        last' :: CmmNode O C -> CmmNode O C
-        last' (CmmBranch bid)              = CmmBranch $ sub bid
-        last' (CmmCondBranch p t f)        = cond p (sub t) (sub f)
-        last' (CmmCall t (Just bid) a r o) = CmmCall t (Just $ sub bid) a r o
-        last' l@(CmmCall _ Nothing _ _ _)  = l
-        last' (CmmForeignCall t r a bid u i) = CmmForeignCall t r a (sub bid) u i
-        last' (CmmSwitch e bs)             = CmmSwitch e $ map (liftM sub) bs
-        cond p t f = if t == f then CmmBranch t else CmmCondBranch p t f
-        exp (CmmStackSlot (CallArea (Young id))       off) =
-             CmmStackSlot (CallArea (Young (sub id))) off
-        exp (CmmLit (CmmBlock id)) = CmmLit (CmmBlock (sub id))
-        exp e = e
-        sub = lookupBid subst
+        addSubst b' = my_trace "found new common block" (ppr bid <> char '=' <> ppr (entryLabel b')) $
+                      (True, bmap, mapInsert bid (entryLabel b') subst)
+
+
+-- -----------------------------------------------------------------------------
+-- Hashing and equality on blocks
+
+-- Below here is mostly boilerplate: hashing blocks ignoring labels,
+-- and comparing blocks modulo a label mapping.
 
 -- To speed up comparisons, we hash each basic block modulo labels.
 -- The hashing is a bit arbitrary (the numbers are completely arbitrary),
 -- but it should be fast and good enough.
-hash_block :: CmmBlock -> Int
+hash_block :: CmmBlock -> HashCode
 hash_block block =
   fromIntegral (foldBlockNodesB3 (hash_fst, hash_mid, hash_lst) block (0 :: Word32) .&. (0x7fffffff :: Word32))
   -- UniqFM doesn't like negative Ints
@@ -103,13 +94,13 @@ hash_block block =
         hash_lst m h = hash_node m + h `shiftL` 1
 
         hash_node :: CmmNode O x -> Word32
-        hash_node (CmmComment (FastString u _ _ _ _)) = cvt u
+        hash_node (CmmComment _) = 0 -- don't care
         hash_node (CmmAssign r e) = hash_reg r + hash_e e
         hash_node (CmmStore e e') = hash_e e + hash_e e'
         hash_node (CmmUnsafeForeignCall t _ as) = hash_tgt t + hash_list hash_e as
-        hash_node (CmmBranch _) = 23 -- would be great to hash these properly
+        hash_node (CmmBranch _) = 23 -- NB. ignore the label
         hash_node (CmmCondBranch p _ _) = hash_e p
-        hash_node (CmmCall e _ _ _ _) = hash_e e
+        hash_node (CmmCall e _ _ _ _ _) = hash_e e
         hash_node (CmmForeignCall t _ _ _ _ _) = hash_tgt t
         hash_node (CmmSwitch e _) = hash_e e
 
@@ -143,25 +134,67 @@ hash_block block =
 -- Utilities: equality and substitution on the graph.
 
 -- Given a map ``subst'' from BlockID -> BlockID, we define equality.
-eqBid :: BidMap -> BlockId -> BlockId -> Bool
+eqBid :: BlockEnv BlockId -> BlockId -> BlockId -> Bool
 eqBid subst bid bid' = lookupBid subst bid == lookupBid subst bid'
-lookupBid :: BidMap -> BlockId -> BlockId
+lookupBid :: BlockEnv BlockId -> BlockId -> BlockId
 lookupBid subst bid = case mapLookup bid subst of
                         Just bid  -> lookupBid subst bid
                         Nothing -> bid
 
--- Equality on the body of a block, modulo a function mapping block IDs to block IDs.
+-- Middle nodes and expressions can contain BlockIds, in particular in
+-- CmmStackSlot and CmmBlock, so we have to use a special equality for
+-- these.
+--
+eqMiddleWith :: (BlockId -> BlockId -> Bool)
+             -> CmmNode O O -> CmmNode O O -> Bool
+eqMiddleWith _ (CmmComment _) (CmmComment _) = True
+eqMiddleWith eqBid (CmmAssign r1 e1) (CmmAssign r2 e2)
+  = r1 == r2 && eqExprWith eqBid e1 e2
+eqMiddleWith eqBid (CmmStore l1 r1) (CmmStore l2 r2)
+  = eqExprWith eqBid l1 l2 && eqExprWith eqBid r1 r2
+eqMiddleWith eqBid (CmmUnsafeForeignCall t1 r1 a1)
+                   (CmmUnsafeForeignCall t2 r2 a2)
+  = t1 == t2 && r1 == r2 && and (zipWith (eqExprWith eqBid) a1 a2)
+eqMiddleWith _ _ _ = False
+
+eqExprWith :: (BlockId -> BlockId -> Bool)
+           -> CmmExpr -> CmmExpr -> Bool
+eqExprWith eqBid = eq
+ where
+  CmmLit l1          `eq` CmmLit l2          = eqLit l1 l2
+  CmmLoad e1 _       `eq` CmmLoad e2 _       = e1 `eq` e2
+  CmmReg r1          `eq` CmmReg r2          = r1==r2
+  CmmRegOff r1 i1    `eq` CmmRegOff r2 i2    = r1==r2 && i1==i2
+  CmmMachOp op1 es1  `eq` CmmMachOp op2 es2  = op1==op2 && es1 `eqs` es2
+  CmmStackSlot a1 i1 `eq` CmmStackSlot a2 i2 = eqArea a1 a2 && i1==i2
+  _e1                `eq` _e2                = False
+
+  xs `eqs` ys = and (zipWith eq xs ys)
+
+  eqLit (CmmBlock id1) (CmmBlock id2) = eqBid id1 id2
+  eqLit l1 l2 = l1 == l2
+
+  eqArea Old Old = True
+  eqArea (Young id1) (Young id2) = eqBid id1 id2
+  eqArea _ _ = False
+
+-- Equality on the body of a block, modulo a function mapping block
+-- IDs to block IDs.
 eqBlockBodyWith :: (BlockId -> BlockId -> Bool) -> CmmBlock -> CmmBlock -> Bool
-eqBlockBodyWith eqBid block block' = middles == middles' && eqLastWith eqBid last last'
-  where (_, middles , JustC last  :: MaybeC C (CmmNode O C)) = blockToNodeList block
-        (_, middles', JustC last' :: MaybeC C (CmmNode O C)) = blockToNodeList block'
+eqBlockBodyWith eqBid block block'
+  = and (zipWith (eqMiddleWith eqBid) (blockToList m) (blockToList m')) &&
+    eqLastWith eqBid l l'
+  where (_,m,l)   = blockSplit block
+        (_,m',l') = blockSplit block'
+
+
 
 eqLastWith :: (BlockId -> BlockId -> Bool) -> CmmNode O C -> CmmNode O C -> Bool
 eqLastWith eqBid (CmmBranch bid1) (CmmBranch bid2) = eqBid bid1 bid2
 eqLastWith eqBid (CmmCondBranch c1 t1 f1) (CmmCondBranch c2 t2 f2) =
   c1 == c2 && eqBid t1 t2 && eqBid f1 f2
-eqLastWith eqBid (CmmCall t1 c1 a1 r1 u1) (CmmCall t2 c2 a2 r2 u2) =
-  t1 == t2 && eqMaybeWith eqBid c1 c2 && a1 == a2 && r1 == r2 && u1 == u2
+eqLastWith eqBid (CmmCall t1 c1 g1 a1 r1 u1) (CmmCall t2 c2 g2 a2 r2 u2) =
+  t1 == t2 && eqMaybeWith eqBid c1 c2 && a1 == a2 && r1 == r2 && u1 == u2 && g1 == g2
 eqLastWith eqBid (CmmSwitch e1 bs1) (CmmSwitch e2 bs2) =
   e1 == e2 && eqListWith (eqMaybeWith eqBid) bs1 bs2
 eqLastWith _ _ _ = False
diff --git a/compiler/cmm/CmmContFlowOpt.hs b/compiler/cmm/CmmContFlowOpt.hs
index 73ce57e93f..f9fa68062e 100644
--- a/compiler/cmm/CmmContFlowOpt.hs
+++ b/compiler/cmm/CmmContFlowOpt.hs
@@ -2,19 +2,19 @@
 {-# OPTIONS_GHC -fno-warn-warnings-deprecations -fno-warn-incomplete-patterns #-}
 
 module CmmContFlowOpt
-    ( runCmmContFlowOpts
-    , removeUnreachableBlocks, replaceBranches
+    ( cmmCfgOpts
+    , cmmCfgOptsProc
+    , removeUnreachableBlocks
+    , replaceLabels
     )
 where
 
 import BlockId
 import Cmm
 import CmmUtils
-import Digraph
 import Maybes
-import Outputable
 
-import Compiler.Hoopl
+import Hoopl
 import Control.Monad
 import Prelude hiding (succ, unzip, zip)
 
@@ -24,196 +24,189 @@ import Prelude hiding (succ, unzip, zip)
 --
 -----------------------------------------------------------------------------
 
-runCmmContFlowOpts :: CmmGroup -> CmmGroup
-runCmmContFlowOpts = map (optProc cmmCfgOpts)
-
 cmmCfgOpts :: CmmGraph -> CmmGraph
-cmmCfgOpts = removeUnreachableBlocks . blockConcat . branchChainElim
-        -- Here branchChainElim can ultimately be replaced
-        -- with a more exciting combination of optimisations
+cmmCfgOpts = removeUnreachableBlocks . blockConcat
+
+cmmCfgOptsProc :: CmmDecl -> CmmDecl
+cmmCfgOptsProc = optProc cmmCfgOpts
 
 optProc :: (g -> g) -> GenCmmDecl d h g -> GenCmmDecl d h g
 optProc opt (CmmProc info lbl g) = CmmProc info lbl (opt g)
 optProc _   top                  = top
 
+
 -----------------------------------------------------------------------------
 --
--- Branch Chain Elimination
+-- Block concatenation
 --
 -----------------------------------------------------------------------------
 
--- | Remove any basic block of the form L: goto L', and replace L with
--- L' everywhere else, unless L is the successor of a call instruction
--- and L' is the entry block. You don't want to set the successor of a
--- function call to the entry block because there is no good way to
--- store both the infotables for the call and from the callee, while
--- putting the stack pointer in a consistent place.
+-- This optimisation does two things:
+--   - If a block finishes with an unconditional branch, then we may
+--     be able to concatenate the block it points to and remove the
+--     branch.  We do this either if the destination block is small
+--     (e.g. just another branch), or if this is the only jump to
+--     this particular destination block.
+--
+--   - If a block finishes in a call whose continuation block is a
+--     goto, then we can shortcut the destination, making the
+--     continuation block the destination of the goto.
+--
+-- Both transformations are improved by working from the end of the
+-- graph towards the beginning, because we may be able to perform many
+-- shortcuts in one go.
+
+
+-- We need to walk over the blocks from the end back to the
+-- beginning.  We are going to maintain the "current" graph
+-- (BlockEnv CmmBlock) as we go, and also a mapping from BlockId
+-- to BlockId, representing continuation labels that we have
+-- renamed.  This latter mapping is important because we might
+-- shortcut a CmmCall continuation.  For example:
+--
+--    Sp[0] = L
+--    call g returns to L
+--
+--    L: goto M
 --
--- JD isn't quite sure when it's safe to share continuations for different
--- function calls -- have to think about where the SP will be,
--- so we'll table that problem for now by leaving all call successors alone.
-
-branchChainElim :: CmmGraph -> CmmGraph
-branchChainElim g
-  | null lone_branch_blocks = g    -- No blocks to remove
-  | otherwise               = {- pprTrace "branchChainElim" (ppr forest) $ -}
-                              replaceLabels (mapFromList edges) g
+--    M: ...
+--
+-- So when we shortcut the L block, we need to replace not only
+-- the continuation of the call, but also references to L in the
+-- code (e.g. the assignment Sp[0] = L).  So we keep track of
+-- which labels we have renamed and apply the mapping at the end
+-- with replaceLabels.
+
+blockConcat  :: CmmGraph -> CmmGraph
+blockConcat g@CmmGraph { g_entry = entry_id }
+  = replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks
   where
-    blocks = toBlockList g
-
-    lone_branch_blocks :: [(BlockId, BlockId)]
-      -- each (L,K) is a block of the form
-      --   L : goto K
-    lone_branch_blocks = mapCatMaybes isLoneBranch blocks
-
-    call_succs = foldl add emptyBlockSet blocks
-      where add :: BlockSet -> CmmBlock -> BlockSet
-            add succs b =
-              case lastNode b of
-                (CmmCall _ (Just k) _ _ _) -> setInsert k succs
-                (CmmForeignCall {succ=k})  -> setInsert k succs
-                _                          -> succs
-
-    isLoneBranch :: CmmBlock -> Maybe (BlockId, BlockId)
-    isLoneBranch block
-      | (JustC (CmmEntry id), [], JustC (CmmBranch target)) <- blockToNodeList block
-      , not (setMember id call_succs)
-      = Just (id,target)
-      | otherwise
-      = Nothing
-
-    -- We build a graph from lone_branch_blocks (every node has only
-    -- one out edge).  Then we
-    --   - topologically sort the graph: if from A we can reach B,
-    --     then A occurs before B in the result list.
-    --   - depth-first search starting from the nodes in this list.
-    --     This gives us a [[node]], in which each list is a dependency
-    --     chain.
-    --   - for each list [a1,a2,...an] replace branches to ai with an.
-    --
-    -- This approach nicely deals with cycles by ignoring them.
-    -- Branches in a cycle will be redirected to somewhere in the
-    -- cycle, but we don't really care where.  A cycle should be dead code,
-    -- and so will be eliminated by removeUnreachableBlocks.
-    --
-    fromNode (b,_) = b
-    toNode   a     = (a,a)
-
-    all_block_ids :: LabelSet
-    all_block_ids = setFromList (map fst lone_branch_blocks)
-                      `setUnion`
-                    setFromList (map snd lone_branch_blocks)
-
-    forest = dfsTopSortG $ graphFromVerticesAndAdjacency nodes lone_branch_blocks
-        where nodes = map toNode $ setElems $ all_block_ids
-
-    edges  = [ (fromNode y, fromNode x)
-             | (x:xs) <- map reverse forest, y <- xs ]
+     -- we might be able to shortcut the entry BlockId itself
+     new_entry
+       | Just entry_blk <- mapLookup entry_id new_blocks
+       , Just dest      <- canShortcut entry_blk
+       = dest
+       | otherwise
+       = entry_id
 
-----------------------------------------------------------------
+     blocks = postorderDfs g
+
+     (new_blocks, shortcut_map) =
+           foldr maybe_concat (toBlockMap g, mapEmpty) blocks
+
+     maybe_concat :: CmmBlock
+                  -> (BlockEnv CmmBlock, BlockEnv BlockId)
+                  -> (BlockEnv CmmBlock, BlockEnv BlockId)
+     maybe_concat block (blocks, shortcut_map)
+        | CmmBranch b' <- last
+        , Just blk' <- mapLookup b' blocks
+        , shouldConcatWith b' blk'
+        = (mapInsert bid (splice head blk') blocks, shortcut_map)
+
+        -- calls: if we can shortcut the continuation label, then
+        -- we must *also* remember to substitute for the label in the
+        -- code, because we will push it somewhere.
+        | Just b'   <- callContinuation_maybe last
+        , Just blk' <- mapLookup b' blocks
+        , Just dest <- canShortcut blk'
+        = (blocks, mapInsert b' dest shortcut_map)
+           -- replaceLabels will substitute dest for b' everywhere, later
+
+        -- non-calls: see if we can shortcut any of the successors.
+        | Nothing <- callContinuation_maybe last
+        = ( mapInsert bid (blockJoinTail head shortcut_last) blocks
+          , shortcut_map )
+
+        | otherwise
+        = (blocks, shortcut_map)
+        where
+          (head, last) = blockSplitTail block
+          bid = entryLabel block
+          shortcut_last = mapSuccessors shortcut last
+          shortcut l =
+             case mapLookup l blocks of
+               Just b | Just dest <- canShortcut b  -> dest
+               _otherwise -> l
+
+     shouldConcatWith b block
+       | num_preds b == 1    = True  -- only one predecessor: go for it
+       | okToDuplicate block = True  -- short enough to duplicate
+       | otherwise           = False
+       where num_preds bid = mapLookup bid backEdges `orElse` 0
+
+     canShortcut :: CmmBlock -> Maybe BlockId
+     canShortcut block
+       | (_, middle, CmmBranch dest) <- blockSplit block
+       , isEmptyBlock middle
+       = Just dest
+       | otherwise
+       = Nothing
+
+     backEdges :: BlockEnv Int -- number of predecessors for each block
+     backEdges = mapInsertWith (+) entry_id 1 $ -- add 1 for the entry id
+                   mapMap setSize $ predMap blocks
+
+     splice :: Block CmmNode C O -> CmmBlock -> CmmBlock
+     splice head rest = head `blockAppend` snd (blockSplitHead rest)
+
+
+callContinuation_maybe :: CmmNode O C -> Maybe BlockId
+callContinuation_maybe (CmmCall { cml_cont = Just b }) = Just b
+callContinuation_maybe (CmmForeignCall { succ = b })   = Just b
+callContinuation_maybe _ = Nothing
+
+okToDuplicate :: CmmBlock -> Bool
+okToDuplicate block
+  = case blockSplit block of
+      (_, m, CmmBranch _) -> isEmptyBlock m
+      -- cheap and cheerful; we might expand this in the future to
+      -- e.g. spot blocks that represent a single instruction or two.
+      -- Be careful: a CmmCall can be more than one instruction, it
+      -- has a CmmExpr inside it.
+      _otherwise -> False
+
+------------------------------------------------------------------------
+-- Map over the CmmGraph, replacing each label with its mapping in the
+-- supplied BlockEnv.
 
 replaceLabels :: BlockEnv BlockId -> CmmGraph -> CmmGraph
-replaceLabels env =
-  replace_eid . mapGraphNodes1 txnode
+replaceLabels env g
+  | mapNull env = g
+  | otherwise   = replace_eid $ mapGraphNodes1 txnode g
    where
      replace_eid g = g {g_entry = lookup (g_entry g)}
      lookup id = mapLookup id env `orElse` id
 
      txnode :: CmmNode e x -> CmmNode e x
      txnode (CmmBranch bid)         = CmmBranch (lookup bid)
-     txnode (CmmCondBranch p t f)   = CmmCondBranch (exp p) (lookup t) (lookup f)
+     txnode (CmmCondBranch p t f)   = mkCmmCondBranch (exp p) (lookup t) (lookup f)
      txnode (CmmSwitch e arms)      = CmmSwitch (exp e) (map (liftM lookup) arms)
-     txnode (CmmCall t k a res r)   = CmmCall (exp t) (liftM lookup k) a res r
+     txnode (CmmCall t k rg a res r) = CmmCall (exp t) (liftM lookup k) rg a res r
      txnode fc@CmmForeignCall{}     = fc{ args = map exp (args fc)
                                         , succ = lookup (succ fc) }
      txnode other                   = mapExpDeep exp other
 
      exp :: CmmExpr -> CmmExpr
      exp (CmmLit (CmmBlock bid))                = CmmLit (CmmBlock (lookup bid))
-     exp (CmmStackSlot (CallArea (Young id)) i) = CmmStackSlot (CallArea (Young (lookup id))) i
+     exp (CmmStackSlot (Young id) i) = CmmStackSlot (Young (lookup id)) i
      exp e                                      = e
 
-
-replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph
-replaceBranches env g = mapGraphNodes (id, id, last) g
-  where
-    last :: CmmNode O C -> CmmNode O C
-    last (CmmBranch id)          = CmmBranch (lookup id)
-    last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi)
-    last (CmmSwitch e tbl)       = CmmSwitch e (map (fmap lookup) tbl)
-    last l@(CmmCall {})          = l
-    last l@(CmmForeignCall {})   = l
-    lookup id = fmap lookup (mapLookup id env) `orElse` id
-            -- XXX: this is a recursive lookup, it follows chains until the lookup
-            -- returns Nothing, at which point we return the last BlockId
+mkCmmCondBranch :: CmmExpr -> Label -> Label -> CmmNode O C
+mkCmmCondBranch p t f = if t == f then CmmBranch t else CmmCondBranch p t f
 
 ----------------------------------------------------------------
 -- Build a map from a block to its set of predecessors. Very useful.
+
 predMap :: [CmmBlock] -> BlockEnv BlockSet
 predMap blocks = foldr add_preds mapEmpty blocks -- find the back edges
   where add_preds block env = foldl (add (entryLabel block)) env (successors block)
         add bid env b' =
           mapInsert b' (setInsert bid (mapLookup b' env `orElse` setEmpty)) env
 
------------------------------------------------------------------------------
---
--- Block concatenation
---
------------------------------------------------------------------------------
-
--- If a block B branches to a label L, L is not the entry block,
--- and L has no other predecessors,
--- then we can splice the block starting with L onto the end of B.
--- Order matters, so we work bottom up (reverse postorder DFS).
--- This optimization can be inhibited by unreachable blocks, but
--- the reverse postorder DFS returns only reachable blocks.
---
--- To ensure correctness, we have to make sure that the BlockId of the block
--- we are about to eliminate is not named in another instruction.
---
--- Note: This optimization does _not_ subsume branch chain elimination.
-
-blockConcat  :: CmmGraph -> CmmGraph
-blockConcat g@(CmmGraph {g_entry=eid}) =
-  replaceLabels concatMap $ ofBlockMap (g_entry g) blocks'
-  where
-     blocks = postorderDfs g
-
-     (blocks', concatMap) =
-           foldr maybe_concat (toBlockMap g, mapEmpty) $ blocks
-
-     maybe_concat :: CmmBlock -> (LabelMap CmmBlock, LabelMap Label) -> (LabelMap CmmBlock, LabelMap Label)
-     maybe_concat b unchanged@(blocks', concatMap) =
-       let bid = entryLabel b
-       in case blockToNodeList b of
-            (JustC h, m, JustC (CmmBranch b')) ->
-               if canConcatWith b' then
-                 (mapInsert bid (splice blocks' h m b') blocks',
-                  mapInsert b' bid concatMap)
-               else unchanged
-            _ -> unchanged
-
-     num_preds bid = liftM setSize (mapLookup bid backEdges) `orElse` 0
-
-     canConcatWith b' = b' /= eid && num_preds b' == 1
-
-     backEdges = predMap blocks
-
-     splice :: forall map n e x.
-               IsMap map =>
-               map (Block n e x) -> n C O -> [n O O] -> KeyOf map -> Block n C x
-     splice blocks' h m bid' =
-          case mapLookup bid' blocks' of
-            Nothing -> panic "unknown successor block"
-            Just block | (_, m', l') <- blockToNodeList block
-                -> blockOfNodeList (JustC h, (m ++ m'), l')
-
 
 -----------------------------------------------------------------------------
 --
 -- Removing unreachable blocks
---
------------------------------------------------------------------------------
 
 removeUnreachableBlocks :: CmmGraph -> CmmGraph
 removeUnreachableBlocks g
diff --git a/compiler/cmm/CmmCvt.hs b/compiler/cmm/CmmCvt.hs
index 80c6079aac..204f26e24b 100644
--- a/compiler/cmm/CmmCvt.hs
+++ b/compiler/cmm/CmmCvt.hs
@@ -12,29 +12,25 @@ import CmmUtils
 import qualified OldCmm as Old
 import OldPprCmm ()
 
-import Compiler.Hoopl hiding ((<*>), mkLabel, mkBranch)
+import Hoopl hiding ((<*>), mkLabel, mkBranch)
 import Data.Maybe
 import Maybes
 import Outputable
 
 cmmOfZgraph :: CmmGroup -> Old.CmmGroup
 cmmOfZgraph tops = map mapTop tops
-  where mapTop (CmmProc h l g) = CmmProc (Old.CmmInfo Nothing Nothing (info_tbl h)) l (ofZgraph g)
+  where mapTop (CmmProc h l g) = CmmProc (info_tbl h) l (ofZgraph g)
         mapTop (CmmData s ds) = CmmData s ds
 
 data ValueDirection = Arguments | Results
 
-add_hints :: Convention -> ValueDirection -> [a] -> [Old.CmmHinted a]
+add_hints :: ForeignTarget -> ValueDirection -> [a] -> [Old.CmmHinted a]
 add_hints conv vd args = zipWith Old.CmmHinted args (get_hints conv vd)
 
-get_hints :: Convention -> ValueDirection -> [ForeignHint]
-get_hints (Foreign (ForeignConvention _ hints _)) Arguments = hints
-get_hints (Foreign (ForeignConvention _ _ hints)) Results   = hints
-get_hints _other_conv                             _vd       = repeat NoHint
-
-get_conv :: ForeignTarget -> Convention
-get_conv (PrimTarget _)       = NativeNodeCall -- JD: SUSPICIOUS
-get_conv (ForeignTarget _ fc) = Foreign fc
+get_hints :: ForeignTarget -> ValueDirection -> [ForeignHint]
+get_hints (ForeignTarget _ (ForeignConvention _ hints _)) Arguments = hints
+get_hints (ForeignTarget _ (ForeignConvention _ _ hints)) Results   = hints
+get_hints (PrimTarget _) _vd = repeat NoHint
 
 cmm_target :: ForeignTarget -> Old.CmmCallTarget
 cmm_target (PrimTarget op) = Old.CmmPrim op Nothing
@@ -89,8 +85,8 @@ ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g
                             CmmUnsafeForeignCall (PrimTarget MO_Touch) _ _ -> Old.CmmNop
                             CmmUnsafeForeignCall target ress args          -> 
                               Old.CmmCall (cmm_target target)
-                                          (add_hints (get_conv target) Results   ress)
-                                          (add_hints (get_conv target) Arguments args)
+                                          (add_hints target Results   ress)
+                                          (add_hints target Arguments args)
                                           Old.CmmMayReturn
 
                   last :: CmmNode O C -> () -> [Old.CmmStmt]
@@ -106,7 +102,7 @@ ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g
                               | otherwise -> [Old.CmmCondBranch expr tid, Old.CmmBranch fid]
                             CmmSwitch arg ids -> [Old.CmmSwitch arg ids]
                             -- ToDo: STG Live
-                            CmmCall e _ _ _ _ -> [Old.CmmJump e Nothing]
+                            CmmCall e _ r _ _ _ -> [Old.CmmJump e (Just r)]
                             CmmForeignCall {} -> panic "ofZgraph: CmmForeignCall"
                           tail_of bid = case foldBlockNodesB3 (first, middle, last) block () of
                                           Old.BasicBlock _ stmts -> stmts
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index 6eb91e89ba..646ecb5c67 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -14,11 +14,11 @@ module CmmExpr
     , GlobalReg(..), globalRegType, spReg, hpReg, spLimReg, nodeReg, node, baseReg
     , VGcPtr(..), vgcFlag 	-- Temporary!
     , DefinerOfLocalRegs, UserOfLocalRegs, foldRegsDefd, foldRegsUsed, filterRegsUsed
-    , DefinerOfSlots, UserOfSlots, foldSlotsDefd, foldSlotsUsed
     , RegSet, emptyRegSet, elemRegSet, extendRegSet, deleteFromRegSet, mkRegSet
-            , plusRegSet, minusRegSet, timesRegSet
-    , regUsedIn, regSlot
-    , Area(..), AreaId(..), SubArea, SubAreaSet, AreaMap, isStackSlotOf
+            , plusRegSet, minusRegSet, timesRegSet, sizeRegSet, nullRegSet
+            , regSetToList
+    , regUsedIn
+    , Area(..)
     , module CmmMachOp
     , module CmmType
     )
@@ -31,9 +31,9 @@ import CmmMachOp
 import BlockId
 import CLabel
 import Unique
-import UniqSet
 
-import Data.Map (Map)
+import Data.Set (Set)
+import qualified Data.Set as Set
 
 -----------------------------------------------------------------------------
 --		CmmExpr
@@ -42,11 +42,12 @@ import Data.Map (Map)
 
 data CmmExpr
   = CmmLit CmmLit               -- Literal
-  | CmmLoad CmmExpr CmmType     -- Read memory location
-  | CmmReg CmmReg		-- Contents of register
+  | CmmLoad !CmmExpr !CmmType   -- Read memory location
+  | CmmReg !CmmReg              -- Contents of register
   | CmmMachOp MachOp [CmmExpr]  -- Machine operation (+, -, *, etc.)
-  | CmmStackSlot Area Int       -- addressing expression of a stack slot
-  | CmmRegOff CmmReg Int	
+  | CmmStackSlot Area {-# UNPACK #-} !Int
+                                -- addressing expression of a stack slot
+  | CmmRegOff !CmmReg Int
 	-- CmmRegOff reg i
 	--        ** is shorthand only, meaning **
 	-- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
@@ -62,20 +63,16 @@ instance Eq CmmExpr where	-- Equality ignores the types
   _e1               == _e2               = False
 
 data CmmReg 
-  = CmmLocal  LocalReg
+  = CmmLocal  {-# UNPACK #-} !LocalReg
   | CmmGlobal GlobalReg
   deriving( Eq, Ord )
 
 -- | A stack area is either the stack slot where a variable is spilled
 -- or the stack space where function arguments and results are passed.
 data Area
-  = RegSlot  LocalReg
-  | CallArea AreaId
-  deriving (Eq, Ord)
-
-data AreaId
   = Old            -- See Note [Old Area]
-  | Young BlockId
+  | Young {-# UNPACK #-} !BlockId  -- Invariant: must be a continuation BlockId
+                   -- See Note [Continuation BlockId] in CmmNode.
   deriving (Eq, Ord)
 
 {- Note [Old Area] 
@@ -94,15 +91,8 @@ necessarily at the young end of the Old area.
 
 End of note -}
 
-type SubArea    = (Area, Int, Int) -- area, offset, width
-type SubAreaSet = Map Area [SubArea]
-
-type AreaMap    = Map Area Int
-     -- Byte offset of the oldest byte of the Area, 
-     -- relative to the oldest byte of the Old Area
-
 data CmmLit
-  = CmmInt Integer  Width
+  = CmmInt !Integer  Width
 	-- Interpretation: the 2's complement representation of the value
 	-- is truncated to the specified size.  This is easier than trying
 	-- to keep the value within range, because we don't know whether
@@ -120,7 +110,11 @@ data CmmLit
         -- It is also used inside the NCG during when generating
         -- position-independent code. 
   | CmmLabelDiffOff CLabel CLabel Int   -- label1 - label2 + offset
-  | CmmBlock BlockId			-- Code label
+
+  | CmmBlock {-# UNPACK #-} !BlockId     -- Code label
+        -- Invariant: must be a continuation BlockId
+        -- See Note [Continuation BlockId] in CmmNode.
+
   | CmmHighStackMark -- stands for the max stack space used during a procedure
   deriving Eq
 
@@ -163,7 +157,7 @@ maybeInvertCmmExpr _ = Nothing
 -----------------------------------------------------------------------------
 
 data LocalReg
-  = LocalReg !Unique CmmType
+  = LocalReg {-# UNPACK #-} !Unique CmmType
     -- ^ Parameters:
     --   1. Identifier
     --   2. Type
@@ -189,22 +183,35 @@ localRegType (LocalReg _ rep) = rep
 -----------------------------------------------------------------------------
 
 -- | Sets of local registers
-type RegSet              =  UniqSet LocalReg
+
+-- These are used for dataflow facts, and a common operation is taking
+-- the union of two RegSets and then asking whether the union is the
+-- same as one of the inputs.  UniqSet isn't good here, because
+-- sizeUniqSet is O(n) whereas Set.size is O(1), so we use ordinary
+-- Sets.
+
+type RegSet              =  Set LocalReg
 emptyRegSet             :: RegSet
+nullRegSet              :: RegSet -> Bool
 elemRegSet              :: LocalReg -> RegSet -> Bool
 extendRegSet            :: RegSet -> LocalReg -> RegSet
 deleteFromRegSet        :: RegSet -> LocalReg -> RegSet
 mkRegSet                :: [LocalReg] -> RegSet
 minusRegSet, plusRegSet, timesRegSet :: RegSet -> RegSet -> RegSet
-
-emptyRegSet      = emptyUniqSet
-elemRegSet       = elementOfUniqSet
-extendRegSet     = addOneToUniqSet
-deleteFromRegSet = delOneFromUniqSet
-mkRegSet         = mkUniqSet
-minusRegSet      = minusUniqSet
-plusRegSet       = unionUniqSets
-timesRegSet      = intersectUniqSets
+sizeRegSet              :: RegSet -> Int
+regSetToList            :: RegSet -> [LocalReg]
+
+emptyRegSet      = Set.empty
+nullRegSet       = Set.null
+elemRegSet       = Set.member
+extendRegSet     = flip Set.insert
+deleteFromRegSet = flip Set.delete
+mkRegSet         = Set.fromList
+minusRegSet      = Set.difference
+plusRegSet       = Set.union
+timesRegSet      = Set.intersection
+sizeRegSet       = Set.size
+regSetToList     = Set.toList
 
 class UserOfLocalRegs a where
   foldRegsUsed :: (b -> LocalReg -> b) -> b -> a -> b
@@ -236,7 +243,7 @@ instance DefinerOfLocalRegs LocalReg where
     foldRegsDefd f z r = f z r
 
 instance UserOfLocalRegs RegSet where
-    foldRegsUsed f = foldUniqSet (flip f)
+    foldRegsUsed f = Set.fold (flip f)
 
 instance UserOfLocalRegs CmmExpr where
   foldRegsUsed f z e = expr z e
@@ -271,49 +278,6 @@ reg `regUsedIn` CmmMachOp _ es   = any (reg `regUsedIn`) es
 _   `regUsedIn` CmmStackSlot _ _ = False
 
 -----------------------------------------------------------------------------
---    Stack slots
------------------------------------------------------------------------------
-
-isStackSlotOf :: CmmExpr -> LocalReg -> Bool
-isStackSlotOf (CmmStackSlot (RegSlot r) _) r' = r == r'
-isStackSlotOf _ _ = False
-
-regSlot :: LocalReg -> CmmExpr
-regSlot r = CmmStackSlot (RegSlot r) (widthInBytes $ typeWidth $ localRegType r)
-
------------------------------------------------------------------------------
---    Stack slot use information for expressions and other types [_$_]
------------------------------------------------------------------------------
-
--- Fold over the area, the offset into the area, and the width of the subarea.
-class UserOfSlots a where
-  foldSlotsUsed :: (b -> SubArea -> b) -> b -> a -> b
-
-class DefinerOfSlots a where
-  foldSlotsDefd :: (b -> SubArea -> b) -> b -> a -> b
-
-instance UserOfSlots CmmExpr where
-  foldSlotsUsed f z e = expr z e
-    where expr z (CmmLit _)          = z
-          expr z (CmmLoad (CmmStackSlot a i) ty) = f z (a, i, widthInBytes $ typeWidth ty)
-          expr z (CmmLoad addr _)    = foldSlotsUsed f z addr
-          expr z (CmmReg _)          = z
-          expr z (CmmMachOp _ exprs) = foldSlotsUsed f z exprs
-          expr z (CmmRegOff _ _)     = z
-          expr z (CmmStackSlot _ _)  = z
-
-instance UserOfSlots a => UserOfSlots [a] where
-  foldSlotsUsed _ set [] = set
-  foldSlotsUsed f set (x:xs) = foldSlotsUsed f (foldSlotsUsed f set x) xs
-
-instance DefinerOfSlots a => DefinerOfSlots [a] where
-  foldSlotsDefd _ set [] = set
-  foldSlotsDefd f set (x:xs) = foldSlotsDefd f (foldSlotsDefd f set x) xs
-
-instance DefinerOfSlots SubArea where
-    foldSlotsDefd f z a = f z a
-
------------------------------------------------------------------------------
 --		Global STG registers
 -----------------------------------------------------------------------------
 
diff --git a/compiler/cmm/CmmInfo.hs b/compiler/cmm/CmmInfo.hs
index fe0c104d1c..a171faa057 100644
--- a/compiler/cmm/CmmInfo.hs
+++ b/compiler/cmm/CmmInfo.hs
@@ -19,6 +19,8 @@ import CmmUtils
 import CLabel
 import SMRep
 import Bitmap
+import Stream (Stream)
+import qualified Stream
 
 import Maybes
 import Constants
@@ -40,10 +42,16 @@ mkEmptyContInfoTable info_lbl
                  , cit_prof = NoProfilingInfo
                  , cit_srt  = NoC_SRT }
 
-cmmToRawCmm :: Platform -> [Old.CmmGroup] -> IO [Old.RawCmmGroup]
+cmmToRawCmm :: Platform -> Stream IO Old.CmmGroup ()
+            -> IO (Stream IO Old.RawCmmGroup ())
 cmmToRawCmm platform cmms
   = do { uniqs <- mkSplitUniqSupply 'i'
-       ; return (initUs_ uniqs (mapM (concatMapM (mkInfoTable platform)) cmms)) }
+       ; let do_one uniqs cmm = do
+                case initUs uniqs $ concatMapM (mkInfoTable platform) cmm of
+                  (b,uniqs') -> return (uniqs',b)
+                  -- NB. strictness fixes a space leak.  DO NOT REMOVE.
+       ; return (Stream.mapAccumL do_one uniqs cmms >> return ())
+       }
 
 -- Make a concrete info table, represented as a list of CmmStatic
 -- (it can't be simply a list of Word, because the SRT field is
@@ -82,7 +90,7 @@ mkInfoTable :: Platform -> CmmDecl -> UniqSM [RawCmmDecl]
 mkInfoTable _ (CmmData sec dat) 
   = return [CmmData sec dat]
 
-mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks)
+mkInfoTable platform (CmmProc info entry_label blocks)
   | CmmNonInfoTable <- info   -- Code without an info table.  Easy.
   = return [CmmProc Nothing entry_label blocks]
                                
@@ -91,7 +99,8 @@ mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks)
        ; return (top_decls  ++
                  mkInfoTableAndCode info_lbl info_cts
                                     entry_label blocks) }
-  | otherwise = panic "mkInfoTable"  -- Patern match overlap check not clever enough
+  | otherwise = panic "mkInfoTable"
+                  -- Patern match overlap check not clever enough
 
 -----------------------------------------------------
 type InfoTableContents = ( [CmmLit]	     -- The standard part
diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs
new file mode 100644
index 0000000000..3ee06215bc
--- /dev/null
+++ b/compiler/cmm/CmmLayoutStack.hs
@@ -0,0 +1,1049 @@
+{-# LANGUAGE RecordWildCards, GADTs #-}
+module CmmLayoutStack (
+       cmmLayoutStack, setInfoTableStackMap, cmmSink
+  ) where
+
+import StgCmmUtils      ( callerSaveVolatileRegs ) -- XXX
+import StgCmmForeign    ( saveThreadState, loadThreadState ) -- XXX
+
+import Cmm
+import BlockId
+import CLabel
+import CmmUtils
+import MkGraph
+import Module
+import ForeignCall
+import CmmLive
+import CmmProcPoint
+import SMRep
+import Hoopl hiding ((<*>), mkLast, mkMiddle)
+import Constants
+import UniqSupply
+import Maybes
+import UniqFM
+import Util
+
+import FastString
+import Outputable
+import Data.Map (Map)
+import qualified Data.Map as Map
+import qualified Data.Set as Set
+import Control.Monad.Fix
+import Data.Array as Array
+import Data.Bits
+import Data.List (nub, partition)
+import Control.Monad (liftM)
+
+#include "HsVersions.h"
+
+
+data StackSlot = Occupied | Empty
+     -- Occupied: a return address or part of an update frame
+
+instance Outputable StackSlot where
+  ppr Occupied = ptext (sLit "XXX")
+  ppr Empty    = ptext (sLit "---")
+
+-- All stack locations are expressed as positive byte offsets from the
+-- "base", which is defined to be the address above the return address
+-- on the stack on entry to this CmmProc.
+--
+-- Lower addresses have higher StackLocs.
+--
+type StackLoc = ByteOff
+
+{-
+ A StackMap describes the stack at any given point.  At a continuation
+ it has a particular layout, like this:
+
+         |             | <- base
+         |-------------|
+         |     ret0    | <- base + 8
+         |-------------|
+         .  upd frame  . <- base + sm_ret_off
+         |-------------|
+         |             |
+         .    vars     .
+         . (live/dead) .
+         |             | <- base + sm_sp - sm_args
+         |-------------|
+         |    ret1     |
+         .  ret vals   . <- base + sm_sp    (<--- Sp points here)
+         |-------------|
+
+Why do we include the final return address (ret0) in our stack map?  I
+have absolutely no idea, but it seems to be done that way consistently
+in the rest of the code generator, so I played along here. --SDM
+
+Note that we will be constructing an info table for the continuation
+(ret1), which needs to describe the stack down to, but not including,
+the update frame (or ret0, if there is no update frame).
+-}
+
+data StackMap = StackMap
+ {  sm_sp   :: StackLoc
+       -- ^ the offset of Sp relative to the base on entry
+       -- to this block.
+ ,  sm_args :: ByteOff
+       -- ^ the number of bytes of arguments in the area for this block
+       -- Defn: the offset of young(L) relative to the base is given by
+       -- (sm_sp - sm_args) of the StackMap for block L.
+ ,  sm_ret_off :: ByteOff
+       -- ^ Number of words of stack that we do not describe with an info
+       -- table, because it contains an update frame.
+ ,  sm_regs :: UniqFM (LocalReg,StackLoc)
+       -- ^ regs on the stack
+ }
+
+instance Outputable StackMap where
+  ppr StackMap{..} =
+     text "Sp = " <> int sm_sp $$
+     text "sm_args = " <> int sm_args $$
+     text "sm_ret_off = " <> int sm_ret_off $$
+     text "sm_regs = " <> ppr (eltsUFM sm_regs)
+
+
+cmmLayoutStack :: ProcPointSet -> ByteOff -> CmmGraph
+               -> UniqSM (CmmGraph, BlockEnv StackMap)
+cmmLayoutStack procpoints entry_args
+               graph0@(CmmGraph { g_entry = entry })
+  = do
+    pprTrace "cmmLayoutStack" (ppr entry_args) $ return ()
+    (graph, liveness) <- removeDeadAssignments graph0
+    pprTrace "liveness" (ppr liveness) $ return ()
+    let blocks = postorderDfs graph
+
+    (final_stackmaps, final_high_sp, new_blocks) <-
+          mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
+            layout procpoints liveness entry entry_args
+                   rec_stackmaps rec_high_sp blocks
+
+    new_blocks' <- mapM lowerSafeForeignCall new_blocks
+
+    pprTrace ("Sp HWM") (ppr final_high_sp) $
+       return (ofBlockList entry new_blocks', final_stackmaps)
+
+
+
+layout :: BlockSet                      -- proc points
+       -> BlockEnv CmmLive              -- liveness
+       -> BlockId                       -- entry
+       -> ByteOff                       -- stack args on entry
+
+       -> BlockEnv StackMap             -- [final] stack maps
+       -> ByteOff                       -- [final] Sp high water mark
+
+       -> [CmmBlock]                    -- [in] blocks
+
+       -> UniqSM
+          ( BlockEnv StackMap           -- [out] stack maps
+          , ByteOff                     -- [out] Sp high water mark
+          , [CmmBlock]                  -- [out] new blocks
+          )
+
+layout procpoints liveness entry entry_args final_stackmaps final_hwm blocks
+  = go blocks init_stackmap entry_args []
+  where
+    (updfr, cont_info)  = collectContInfo blocks
+
+    init_stackmap = mapSingleton entry StackMap{ sm_sp   = entry_args
+                                               , sm_args = entry_args
+                                               , sm_ret_off = updfr
+                                               , sm_regs = emptyUFM
+                                               }
+
+    go [] acc_stackmaps acc_hwm acc_blocks
+      = return (acc_stackmaps, acc_hwm, acc_blocks)
+
+    go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
+      = do
+       let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0
+    
+       let stack0@StackMap { sm_sp = sp0 }
+               = mapFindWithDefault
+                     (pprPanic "no stack map for" (ppr entry_lbl))
+                     entry_lbl acc_stackmaps
+    
+       pprTrace "layout" (ppr entry_lbl <+> ppr stack0) $ return ()
+    
+       -- (a) Update the stack map to include the effects of
+       --     assignments in this block
+       let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0
+    
+       -- (b) Insert assignments to reload all the live variables if this
+       --     block is a proc point
+       let middle1 = if entry_lbl `setMember` procpoints
+                        then foldr blockCons middle0 (insertReloads stack0)
+                        else middle0
+    
+       -- (c) Look at the last node and if we are making a call or
+       --     jumping to a proc point, we must save the live
+       --     variables, adjust Sp, and construct the StackMaps for
+       --     each of the successor blocks.  See handleLastNode for
+       --     details.
+       (middle2, sp_off, last1, fixup_blocks, out)
+           <- handleLastNode procpoints liveness cont_info
+                             acc_stackmaps stack1 middle0 last0
+    
+       pprTrace "layout(out)" (ppr out) $ return ()
+
+       -- (d) Manifest Sp: run over the nodes in the block and replace
+       --     CmmStackSlot with CmmLoad from Sp with a concrete offset.
+       --
+       -- our block:
+       --    middle1          -- the original middle nodes
+       --    middle2          -- live variable saves from handleLastNode
+       --    Sp = Sp + sp_off -- Sp adjustment goes here
+       --    last1            -- the last node
+       --
+       let middle_pre = blockToList $ foldl blockSnoc middle1 middle2
+
+           sp_high = final_hwm - entry_args
+              -- The stack check value is adjusted by the Sp offset on
+              -- entry to the proc, which is entry_args.  We are
+              -- assuming that we only do a stack check at the
+              -- beginning of a proc, and we don't modify Sp before the
+              -- check.
+
+           final_blocks = manifestSp final_stackmaps stack0 sp0 sp_high entry0
+                              middle_pre sp_off last1 fixup_blocks
+
+           acc_stackmaps' = mapUnion acc_stackmaps out
+
+           hwm' = maximum (acc_hwm : (sp0 - sp_off) : map sm_sp (mapElems out))
+
+       go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks)
+
+
+-- -----------------------------------------------------------------------------
+
+-- This doesn't seem right somehow.  We need to find out whether this
+-- proc will push some update frame material at some point, so that we
+-- can avoid using that area of the stack for spilling.  The
+-- updfr_space field of the CmmProc *should* tell us, but it doesn't
+-- (I think maybe it gets filled in later when we do proc-point
+-- splitting).
+--
+-- So we'll just take the max of all the cml_ret_offs.  This could be
+-- unnecessarily pessimistic, but probably not in the code we
+-- generate.
+
+collectContInfo :: [CmmBlock] -> (ByteOff, BlockEnv ByteOff)
+collectContInfo blocks
+  = (maximum ret_offs, mapFromList (catMaybes mb_argss))
+ where
+  (mb_argss, ret_offs) = mapAndUnzip get_cont blocks
+
+  get_cont b =
+     case lastNode b of
+        CmmCall { cml_cont = Just l, .. }
+           -> (Just (l, cml_ret_args), cml_ret_off)
+        CmmForeignCall { .. }
+           -> (Just (succ, 0), updfr) -- ??
+        _other -> (Nothing, 0)
+
+
+-- -----------------------------------------------------------------------------
+-- Updating the StackMap from middle nodes
+
+-- Look for loads from stack slots, and update the StackMap.  This is
+-- purely for optimisation reasons, so that we can avoid saving a
+-- variable back to a different stack slot if it is already on the
+-- stack.
+--
+-- This happens a lot: for example when function arguments are passed
+-- on the stack and need to be immediately saved across a call, we
+-- want to just leave them where they are on the stack.
+--
+procMiddle :: BlockEnv StackMap -> CmmNode e x -> StackMap -> StackMap
+procMiddle stackmaps node sm
+  = case node of
+     CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _)
+       -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) }
+        where loc = getStackLoc area off stackmaps
+     CmmAssign (CmmLocal r) _other
+       -> sm { sm_regs = delFromUFM (sm_regs sm) r }
+     _other
+       -> sm
+
+getStackLoc :: Area -> ByteOff -> BlockEnv StackMap -> StackLoc
+getStackLoc Old       n _         = n
+getStackLoc (Young l) n stackmaps =
+  case mapLookup l stackmaps of
+    Nothing -> pprPanic "getStackLoc" (ppr l)
+    Just sm -> sm_sp sm - sm_args sm + n
+
+
+-- -----------------------------------------------------------------------------
+-- Handling stack allocation for a last node
+
+-- We take a single last node and turn it into:
+--
+--    C1 (some statements)
+--    Sp = Sp + N
+--    C2 (some more statements)
+--    call f()          -- the actual last node
+--
+-- plus possibly some more blocks (we may have to add some fixup code
+-- between the last node and the continuation).
+--
+-- C1: is the code for saving the variables across this last node onto
+-- the stack, if the continuation is a call or jumps to a proc point.
+--
+-- C2: if the last node is a safe foreign call, we have to inject some
+-- extra code that goes *after* the Sp adjustment.
+
+handleLastNode
+   :: ProcPointSet -> BlockEnv CmmLive -> BlockEnv ByteOff
+   -> BlockEnv StackMap -> StackMap
+   -> Block CmmNode O O
+   -> CmmNode O C
+   -> UniqSM
+      ( [CmmNode O O]      -- nodes to go *before* the Sp adjustment
+      , ByteOff            -- amount to adjust Sp
+      , CmmNode O C        -- new last node
+      , [CmmBlock]         -- new blocks
+      , BlockEnv StackMap  -- stackmaps for the continuations
+      )
+
+handleLastNode procpoints liveness cont_info stackmaps
+               stack0@StackMap { sm_sp = sp0 } middle last
+ = case last of
+    --  At each return / tail call,
+    --  adjust Sp to point to the last argument pushed, which
+    --  is cml_args, after popping any other junk from the stack.
+    CmmCall{ cml_cont = Nothing, .. } -> do
+      let sp_off = sp0 - cml_args
+      return ([], sp_off, last, [], mapEmpty)
+
+    --  At each CmmCall with a continuation:
+    CmmCall{ cml_cont = Just cont_lbl, .. } ->
+       return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off
+
+    CmmForeignCall{ succ = cont_lbl, .. } -> do
+       return $ lastCall cont_lbl wORD_SIZE wORD_SIZE (sm_ret_off stack0)
+            -- one word each for args and results: the return address
+
+    CmmBranch{..}     ->  handleProcPoints
+    CmmCondBranch{..} ->  handleProcPoints
+    CmmSwitch{..}     ->  handleProcPoints
+
+  where
+     -- Calls and ForeignCalls are handled the same way:
+     lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff
+              -> ( [CmmNode O O]
+                 , ByteOff
+                 , CmmNode O C
+                 , [CmmBlock]
+                 , BlockEnv StackMap
+                 )
+     lastCall lbl cml_args cml_ret_args cml_ret_off
+      =  ( assignments
+         , spOffsetForCall sp0 cont_stack cml_args
+         , last
+         , [] -- no new blocks
+         , mapSingleton lbl cont_stack )
+      where
+         (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off
+
+
+     prepareStack lbl cml_ret_args cml_ret_off
+       | Just cont_stack <- mapLookup lbl stackmaps
+             -- If we have already seen this continuation before, then
+             -- we just have to make the stack look the same:
+       = (fixupStack stack0 cont_stack, cont_stack)
+             -- Otherwise, we have to allocate the stack frame
+       | otherwise
+       = (save_assignments, new_cont_stack)
+       where
+        (new_cont_stack, save_assignments)
+           = setupStackFrame lbl liveness cml_ret_off cml_ret_args stack0
+
+
+     -- For other last nodes (branches), if any of the targets is a
+     -- proc point, we have to set up the stack to match what the proc
+     -- point is expecting.
+     --
+     handleProcPoints :: UniqSM ( [CmmNode O O]
+                                , ByteOff
+                                , CmmNode O C
+                                , [CmmBlock]
+                                , BlockEnv StackMap )
+
+     handleProcPoints
+         -- Note [diamond proc point]
+       | Just l <- futureContinuation middle
+       , (nub $ filter (`setMember` procpoints) $ successors last) == [l]
+       = do
+         let cont_args = mapFindWithDefault 0 l cont_info
+             (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0)
+             out = mapFromList [ (l', cont_stack)
+                               | l' <- successors last ]
+         return ( assigs
+                , spOffsetForCall sp0 cont_stack wORD_SIZE
+                , last
+                , []
+                , out)
+
+        | otherwise = do
+          pps <- mapM handleProcPoint (successors last)
+          let lbl_map :: LabelMap Label
+              lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ]
+              fix_lbl l = mapLookup l lbl_map `orElse` l
+          return ( []
+                 , 0
+                 , mapSuccessors fix_lbl last
+                 , concat [ blk | (_,_,_,blk) <- pps ]
+                 , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] )
+
+     -- For each proc point that is a successor of this block
+     --   (a) if the proc point already has a stackmap, we need to
+     --       shuffle the current stack to make it look the same.
+     --       We have to insert a new block to make this happen.
+     --   (b) otherwise, call "allocate live stack0" to make the
+     --       stack map for the proc point
+     handleProcPoint :: BlockId
+                     -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock])
+     handleProcPoint l
+        | not (l `setMember` procpoints) = return (l, l, stack0, [])
+        | otherwise = do
+           tmp_lbl <- liftM mkBlockId $ getUniqueM
+           let
+               (stack2, assigs) =
+                  case mapLookup l stackmaps of
+                    Just pp_sm -> (pp_sm, fixupStack stack0 pp_sm)
+                    Nothing    ->
+                      pprTrace "first visit to proc point"
+                                   (ppr l <+> ppr stack1) $
+                      (stack1, assigs)
+                      where
+                       cont_args = mapFindWithDefault 0 l cont_info
+                       (stack1, assigs) =
+                           setupStackFrame l liveness (sm_ret_off stack0)
+                                                       cont_args stack0
+
+               sp_off = sp0 - sm_sp stack2
+
+               block = blockJoin (CmmEntry tmp_lbl)
+                                 (maybeAddSpAdj sp_off (blockFromList assigs))
+                                 (CmmBranch l)
+           --
+           return (l, tmp_lbl, stack2, [block])
+
+
+  
+-- Sp is currently pointing to current_sp,
+-- we want it to point to
+--    (sm_sp cont_stack - sm_args cont_stack + args)
+-- so the difference is
+--    sp0 - (sm_sp cont_stack - sm_args cont_stack + args)
+spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff
+spOffsetForCall current_sp cont_stack args
+  = current_sp - (sm_sp cont_stack - sm_args cont_stack + args)
+
+
+-- | create a sequence of assignments to establish the new StackMap,
+-- given the old StackMap.
+fixupStack :: StackMap -> StackMap -> [CmmNode O O]
+fixupStack old_stack new_stack = concatMap move new_locs
+ where
+     old_map :: Map LocalReg ByteOff
+     old_map  = Map.fromList (stackSlotRegs old_stack)
+     new_locs = stackSlotRegs new_stack
+
+     move (r,n)
+       | Just m <- Map.lookup r old_map, n == m = []
+       | otherwise = [CmmStore (CmmStackSlot Old n)
+                               (CmmReg (CmmLocal r))]
+
+
+
+setupStackFrame
+             :: BlockId                 -- label of continuation
+             -> BlockEnv CmmLive        -- liveness
+             -> ByteOff      -- updfr
+             -> ByteOff      -- bytes of return values on stack
+             -> StackMap     -- current StackMap
+             -> (StackMap, [CmmNode O O])
+
+setupStackFrame lbl liveness updfr_off ret_args stack0
+  = (cont_stack, assignments)
+  where
+      -- get the set of LocalRegs live in the continuation
+      live = mapFindWithDefault Set.empty lbl liveness
+
+      -- the stack from the base to updfr_off is off-limits.
+      -- our new stack frame contains:
+      --   * saved live variables
+      --   * the return address [young(C) + 8]
+      --   * the args for the call,
+      --     which are replaced by the return values at the return
+      --     point.
+
+      -- everything up to updfr_off is off-limits
+      -- stack1 contains updfr_off, plus everything we need to save
+      (stack1, assignments) = allocate updfr_off live stack0
+
+      -- And the Sp at the continuation is:
+      --   sm_sp stack1 + ret_args
+      cont_stack = stack1{ sm_sp = sm_sp stack1 + ret_args
+                         , sm_args = ret_args
+                         , sm_ret_off = updfr_off
+                         }
+
+
+-- -----------------------------------------------------------------------------
+-- Note [diamond proc point]
+--
+-- This special case looks for the pattern we get from a typical
+-- tagged case expression:
+--
+--    Sp[young(L1)] = L1
+--    if (R1 & 7) != 0 goto L1 else goto L2
+--  L2:
+--    call [R1] returns to L1
+--  L1: live: {y}
+--    x = R1
+--
+-- If we let the generic case handle this, we get
+--
+--    Sp[-16] = L1
+--    if (R1 & 7) != 0 goto L1a else goto L2
+--  L2:
+--    Sp[-8] = y
+--    Sp = Sp - 16
+--    call [R1] returns to L1
+--  L1a:
+--    Sp[-8] = y
+--    Sp = Sp - 16
+--    goto L1
+--  L1:
+--    x = R1
+--
+-- The code for saving the live vars is duplicated in each branch, and
+-- furthermore there is an extra jump in the fast path (assuming L1 is
+-- a proc point, which it probably is if there is a heap check).
+--
+-- So to fix this we want to set up the stack frame before the
+-- conditional jump.  How do we know when to do this, and when it is
+-- safe?  The basic idea is, when we see the assignment
+-- 
+--   Sp[young(L)] = L
+-- 
+-- we know that
+--   * we are definitely heading for L
+--   * there can be no more reads from another stack area, because young(L)
+--     overlaps with it.
+--
+-- We don't necessarily know that everything live at L is live now
+-- (some might be assigned between here and the jump to L).  So we
+-- simplify and only do the optimisation when we see
+--
+--   (1) a block containing an assignment of a return address L
+--   (2) ending in a branch where one (and only) continuation goes to L,
+--       and no other continuations go to proc points.
+--
+-- then we allocate the stack frame for L at the end of the block,
+-- before the branch.
+--
+-- We could generalise (2), but that would make it a bit more
+-- complicated to handle, and this currently catches the common case.
+
+futureContinuation :: Block CmmNode O O -> Maybe BlockId
+futureContinuation middle = foldBlockNodesB f middle Nothing
+   where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId
+         f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _
+               = Just l
+         f _ r = r
+
+-- -----------------------------------------------------------------------------
+-- Saving live registers
+
+-- | Given a set of live registers and a StackMap, save all the registers
+-- on the stack and return the new StackMap and the assignments to do
+-- the saving.
+--
+allocate :: ByteOff -> RegSet -> StackMap -> (StackMap, [CmmNode O O])
+allocate ret_off live stackmap@StackMap{ sm_sp = sp0
+                                       , sm_regs = regs0 }
+ =
+  pprTrace "allocate" (ppr live $$ ppr stackmap) $
+
+   -- we only have to save regs that are not already in a slot
+   let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live)
+       regs1   = filterUFM (\(r,_) -> elemRegSet r live) regs0
+   in
+
+   -- make a map of the stack
+   let stack = reverse $ Array.elems $
+               accumArray (\_ x -> x) Empty (1, toWords (max sp0 ret_off)) $
+                 ret_words ++ live_words
+            where ret_words =
+                   [ (x, Occupied)
+                   | x <- [ 1 .. toWords ret_off] ]
+                  live_words =
+                   [ (toWords x, Occupied)
+                   | (r,off) <- eltsUFM regs1,
+                     let w = localRegBytes r,
+                     x <- [ off, off-wORD_SIZE .. off - w + 1] ]
+   in
+
+   -- Pass over the stack: find slots to save all the new live variables,
+   -- choosing the oldest slots first (hence a foldr).
+   let
+       save slot ([], stack, n, assigs, regs) -- no more regs to save
+          = ([], slot:stack, n `plusW` 1, assigs, regs)
+       save slot (to_save, stack, n, assigs, regs)
+          = case slot of
+               Occupied ->  (to_save, Occupied:stack, n `plusW` 1, assigs, regs)
+               Empty
+                 | Just (stack', r, to_save') <-
+                       select_save to_save (slot:stack)
+                 -> let assig = CmmStore (CmmStackSlot Old n')
+                                         (CmmReg (CmmLocal r))
+                        n' = n `plusW` 1
+                   in
+                        (to_save', stack', n', assig : assigs, (r,(r,n')):regs)
+
+                 | otherwise
+                 -> (to_save, slot:stack, n `plusW` 1, assigs, regs)
+
+       -- we should do better here: right now we'll fit the smallest first,
+       -- but it would make more sense to fit the biggest first.
+       select_save :: [LocalReg] -> [StackSlot]
+                   -> Maybe ([StackSlot], LocalReg, [LocalReg])
+       select_save regs stack = go regs []
+         where go []     _no_fit = Nothing
+               go (r:rs) no_fit
+                 | Just rest <- dropEmpty words stack
+                 = Just (replicate words Occupied ++ rest, r, rs++no_fit)
+                 | otherwise
+                 = go rs (r:no_fit)
+                 where words = localRegWords r
+
+       -- fill in empty slots as much as possible
+       (still_to_save, save_stack, n, save_assigs, save_regs)
+          = foldr save (to_save, [], 0, [], []) stack
+
+       -- push any remaining live vars on the stack
+       (push_sp, push_assigs, push_regs)
+          = foldr push (n, [], []) still_to_save
+          where
+              push r (n, assigs, regs)
+                = (n', assig : assigs, (r,(r,n')) : regs)
+                where
+                  n' = n + localRegBytes r
+                  assig = CmmStore (CmmStackSlot Old n')
+                                   (CmmReg (CmmLocal r))
+
+       trim_sp
+          | not (null push_regs) = push_sp
+          | otherwise
+          = n `plusW` (- length (takeWhile isEmpty save_stack))
+
+       final_regs = regs1 `addListToUFM` push_regs
+                          `addListToUFM` save_regs
+
+   in
+  -- XXX should be an assert
+   if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else
+
+   if (trim_sp .&. (wORD_SIZE - 1)) /= 0  then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else
+
+   ( stackmap { sm_regs = final_regs , sm_sp = trim_sp }
+   , push_assigs ++ save_assigs )
+
+
+-- -----------------------------------------------------------------------------
+-- Manifesting Sp
+
+-- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp.  The
+-- block looks like this:
+--
+--    middle_pre       -- the middle nodes
+--    Sp = Sp + sp_off -- Sp adjustment goes here
+--    last             -- the last node
+--
+-- And we have some extra blocks too (that don't contain Sp adjustments)
+--
+-- The adjustment for middle_pre will be different from that for
+-- middle_post, because the Sp adjustment intervenes.
+--
+manifestSp
+   :: BlockEnv StackMap  -- StackMaps for other blocks
+   -> StackMap           -- StackMap for this block
+   -> ByteOff            -- Sp on entry to the block
+   -> ByteOff            -- SpHigh
+   -> CmmNode C O        -- first node
+   -> [CmmNode O O]      -- middle
+   -> ByteOff            -- sp_off
+   -> CmmNode O C        -- last node
+   -> [CmmBlock]         -- new blocks
+   -> [CmmBlock]         -- final blocks with Sp manifest
+
+manifestSp stackmaps stack0 sp0 sp_high
+           first middle_pre sp_off last fixup_blocks
+  = final_block : fixup_blocks'
+  where
+    area_off = getAreaOff stackmaps
+
+    adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x
+    adj_pre_sp  = mapExpDeep (areaToSp sp0            sp_high area_off)
+    adj_post_sp = mapExpDeep (areaToSp (sp0 - sp_off) sp_high area_off)
+
+    final_middle = maybeAddSpAdj sp_off $
+                   blockFromList $
+                   map adj_pre_sp $
+                   elimStackStores stack0 stackmaps area_off $
+                   middle_pre
+
+    final_last    = optStackCheck (adj_post_sp last)
+
+    final_block   = blockJoin first final_middle final_last
+
+    fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks
+
+
+getAreaOff :: BlockEnv StackMap -> (Area -> StackLoc)
+getAreaOff _ Old = 0
+getAreaOff stackmaps (Young l) =
+  case mapLookup l stackmaps of
+    Just sm -> sm_sp sm - sm_args sm
+    Nothing -> pprPanic "getAreaOff" (ppr l)
+
+
+maybeAddSpAdj :: ByteOff -> Block CmmNode O O -> Block CmmNode O O
+maybeAddSpAdj 0      block = block
+maybeAddSpAdj sp_off block
+   = block `blockSnoc` CmmAssign spReg (cmmOffset (CmmReg spReg) sp_off)
+
+
+{-
+Sp(L) is the Sp offset on entry to block L relative to the base of the
+OLD area.
+
+SpArgs(L) is the size of the young area for L, i.e. the number of
+arguments.
+
+ - in block L, each reference to [old + N] turns into
+   [Sp + Sp(L) - N]
+
+ - in block L, each reference to [young(L') + N] turns into
+   [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]
+
+ - be careful with the last node of each block: Sp has already been adjusted
+   to be Sp + Sp(L) - Sp(L')
+-}
+
+areaToSp :: ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr
+areaToSp sp_old _sp_hwm area_off (CmmStackSlot area n) =
+  cmmOffset (CmmReg spReg) (sp_old - area_off area - n)
+areaToSp _ sp_hwm _ (CmmLit CmmHighStackMark) = CmmLit (mkIntCLit sp_hwm)
+areaToSp _ _ _ (CmmMachOp (MO_U_Lt _)  -- Note [null stack check]
+                      [CmmMachOp (MO_Sub _)
+                              [ CmmReg (CmmGlobal Sp)
+                              , CmmLit (CmmInt 0 _)],
+                       CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
+areaToSp _ _ _ other = other
+
+-- -----------------------------------------------------------------------------
+-- Note [null stack check]
+--
+-- If the high-water Sp is zero, then we end up with
+--
+--   if (Sp - 0 < SpLim) then .. else ..
+--
+-- and possibly some dead code for the failure case.  Optimising this
+-- away depends on knowing that SpLim <= Sp, so it is really the job
+-- of the stack layout algorithm, hence we do it now.  This is also
+-- convenient because control-flow optimisation later will drop the
+-- dead code.
+
+optStackCheck :: CmmNode O C -> CmmNode O C
+optStackCheck n = -- Note [null stack check]
+ case n of
+   CmmCondBranch (CmmLit (CmmInt 0 _)) _true false -> CmmBranch false
+   other -> other
+
+
+-- -----------------------------------------------------------------------------
+
+-- | Eliminate stores of the form
+--
+--    Sp[area+n] = r
+--
+-- when we know that r is already in the same slot as Sp[area+n].  We
+-- could do this in a later optimisation pass, but that would involve
+-- a separate analysis and we already have the information to hand
+-- here.  It helps clean up some extra stack stores in common cases.
+--
+-- Note that we may have to modify the StackMap as we walk through the
+-- code using procMiddle, since an assignment to a variable in the
+-- StackMap will invalidate its mapping there.
+--
+elimStackStores :: StackMap
+                -> BlockEnv StackMap
+                -> (Area -> ByteOff)
+                -> [CmmNode O O]
+                -> [CmmNode O O]
+elimStackStores stackmap stackmaps area_off nodes
+  = go stackmap nodes
+  where
+    go _stackmap [] = []
+    go stackmap (n:ns)
+     = case n of
+         CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r))
+            | Just (_,off) <- lookupUFM (sm_regs stackmap) r
+            , area_off area + m == off
+            -> pprTrace "eliminated a node!" (ppr r) $ go stackmap ns
+         _otherwise
+            -> n : go (procMiddle stackmaps n stackmap) ns
+
+
+-- -----------------------------------------------------------------------------
+-- Update info tables to include stack liveness
+
+
+setInfoTableStackMap :: BlockEnv StackMap -> CmmDecl -> CmmDecl
+setInfoTableStackMap stackmaps
+    (CmmProc top_info@TopInfo{..} l g@CmmGraph{g_entry = eid})
+  = CmmProc top_info{ info_tbl = fix_info info_tbl } l g
+  where
+    fix_info info_tbl@CmmInfoTable{ cit_rep = StackRep _ } =
+       info_tbl { cit_rep = StackRep (get_liveness eid) }
+    fix_info other = other
+
+    get_liveness :: BlockId -> Liveness
+    get_liveness lbl
+      = case mapLookup lbl stackmaps of
+          Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl)
+          Just sm -> stackMapToLiveness sm
+
+setInfoTableStackMap _ d = d
+
+
+stackMapToLiveness :: StackMap -> Liveness
+stackMapToLiveness StackMap{..} =
+   reverse $ Array.elems $
+        accumArray (\_ x -> x) True (toWords sm_ret_off + 1,
+                                     toWords (sm_sp - sm_args)) live_words
+   where
+     live_words =  [ (toWords off, False)
+                   | (r,off) <- eltsUFM sm_regs, isGcPtrType (localRegType r) ]
+
+
+-- -----------------------------------------------------------------------------
+-- Lowering safe foreign calls
+
+{-
+Note [lower safe foreign calls]
+
+We start with
+
+   Sp[young(L1)] = L1
+ ,-----------------------
+ | r1 = foo(x,y,z) returns to L1
+ '-----------------------
+ L1:
+   R1 = r1 -- copyIn, inserted by mkSafeCall
+   ...
+
+the stack layout algorithm will arrange to save and reload everything
+live across the call.  Our job now is to expand the call so we get
+
+   Sp[young(L1)] = L1
+ ,-----------------------
+ | SAVE_THREAD_STATE()
+ | token = suspendThread(BaseReg, interruptible)
+ | r = foo(x,y,z)
+ | BaseReg = resumeThread(token)
+ | LOAD_THREAD_STATE()
+ | R1 = r  -- copyOut
+ | jump L1
+ '-----------------------
+ L1:
+   r = R1 -- copyIn, inserted by mkSafeCall
+   ...
+
+Note the copyOut, which saves the results in the places that L1 is
+expecting them (see Note {safe foreign call convention]).
+-}
+
+lowerSafeForeignCall :: CmmBlock -> UniqSM CmmBlock
+lowerSafeForeignCall block
+  | (entry, middle, CmmForeignCall { .. }) <- blockSplit block
+  = do
+    -- Both 'id' and 'new_base' are KindNonPtr because they're
+    -- RTS-only objects and are not subject to garbage collection
+    id <- newTemp bWord
+    new_base <- newTemp (cmmRegType (CmmGlobal BaseReg))
+    let (caller_save, caller_load) = callerSaveVolatileRegs
+    load_tso <- newTemp gcWord
+    load_stack <- newTemp gcWord
+    let suspend = saveThreadState <*>
+                  caller_save <*>
+                  mkMiddle (callSuspendThread id intrbl)
+        midCall = mkUnsafeCall tgt res args
+        resume  = mkMiddle (callResumeThread new_base id) <*>
+                  -- Assign the result to BaseReg: we
+                  -- might now have a different Capability!
+                  mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <*>
+                  caller_load <*>
+                  loadThreadState load_tso load_stack
+        -- Note: The successor must be a procpoint, and we have already split,
+        --       so we use a jump, not a branch.
+        succLbl = CmmLit (CmmLabel (infoTblLbl succ))
+
+        (ret_args, regs, copyout) = copyOutOflow NativeReturn Jump (Young succ)
+                                           (map (CmmReg . CmmLocal) res)
+                                           updfr (0, [])
+
+        jump = CmmCall { cml_target    = succLbl
+                       , cml_cont      = Just succ
+                       , cml_args_regs = regs
+                       , cml_args      = widthInBytes wordWidth
+                       , cml_ret_args  = ret_args
+                       , cml_ret_off   = updfr }
+
+    graph' <- lgraphOfAGraph $ suspend <*>
+                               midCall <*>
+                               resume  <*>
+                               copyout <*>
+                               mkLast jump
+
+    case toBlockList graph' of
+      [one] -> let (_, middle', last) = blockSplit one
+               in return (blockJoin entry (middle `blockAppend` middle') last)
+      _ -> panic "lowerSafeForeignCall0"
+
+  -- Block doesn't end in a safe foreign call:
+  | otherwise = return block
+
+
+foreignLbl :: FastString -> CmmExpr
+foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId name))
+
+newTemp :: CmmType -> UniqSM LocalReg
+newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep)
+
+callSuspendThread :: LocalReg -> Bool -> CmmNode O O
+callSuspendThread id intrbl =
+  CmmUnsafeForeignCall
+       (ForeignTarget (foreignLbl (fsLit "suspendThread"))
+             (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
+       [id] [CmmReg (CmmGlobal BaseReg), CmmLit (mkIntCLit (fromEnum intrbl))]
+
+callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
+callResumeThread new_base id =
+  CmmUnsafeForeignCall
+       (ForeignTarget (foreignLbl (fsLit "resumeThread"))
+            (ForeignConvention CCallConv [AddrHint] [AddrHint]))
+       [new_base] [CmmReg (CmmLocal id)]
+
+-- -----------------------------------------------------------------------------
+
+plusW :: ByteOff -> WordOff -> ByteOff
+plusW b w = b + w * wORD_SIZE
+
+dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot]
+dropEmpty 0 ss           = Just ss
+dropEmpty n (Empty : ss) = dropEmpty (n-1) ss
+dropEmpty _ _            = Nothing
+
+isEmpty :: StackSlot -> Bool
+isEmpty Empty = True
+isEmpty _ = False
+
+localRegBytes :: LocalReg -> ByteOff
+localRegBytes r = roundUpToWords (widthInBytes (typeWidth (localRegType r)))
+
+localRegWords :: LocalReg -> WordOff
+localRegWords = toWords . localRegBytes
+
+toWords :: ByteOff -> WordOff
+toWords x = x `quot` wORD_SIZE
+
+
+insertReloads :: StackMap -> [CmmNode O O]
+insertReloads stackmap =
+   [ CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot Old sp)
+                                     (localRegType r))
+   | (r,sp) <- stackSlotRegs stackmap
+   ]
+
+
+stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)]
+stackSlotRegs sm = eltsUFM (sm_regs sm)
+
+-- -----------------------------------------------------------------------------
+
+-- If we do this *before* stack layout, we might be able to avoid
+-- saving some things across calls/procpoints.
+--
+-- *but*, that will invalidate the liveness analysis, and we'll have
+-- to re-do it.
+
+cmmSink :: CmmGraph -> UniqSM CmmGraph
+cmmSink graph = do
+  let liveness = cmmLiveness graph
+  return $ cmmSink' liveness graph
+
+cmmSink' :: BlockEnv CmmLive -> CmmGraph -> CmmGraph
+cmmSink' liveness graph
+  = ofBlockList (g_entry graph) $ sink mapEmpty $ postorderDfs graph
+  where
+
+  sink :: BlockEnv [(LocalReg, CmmExpr)] -> [CmmBlock] -> [CmmBlock]
+  sink _ [] = []
+  sink sunk (b:bs) =
+    pprTrace "sink" (ppr l) $
+    blockJoin first final_middle last : sink sunk' bs
+    where
+      l = entryLabel b
+      (first, middle, last) = blockSplit b
+      (middle', assigs) = walk (blockToList middle) emptyBlock
+                               (mapFindWithDefault [] l sunk)
+
+      (dropped_last, assigs') = partition (`conflictsWithLast` last) assigs
+
+      final_middle = foldl blockSnoc middle' (toNodes dropped_last)
+
+      sunk' = mapUnion sunk $
+                 mapFromList [ (l, filt assigs' (getLive l))
+                             | l <- successors last ]
+           where
+               getLive l = mapFindWithDefault Set.empty l liveness
+               filt as live = [ (r,e) | (r,e) <- as, r `Set.member` live ]
+
+
+walk :: [CmmNode O O] -> Block CmmNode O O -> [(LocalReg, CmmExpr)]
+     -> (Block CmmNode O O, [(LocalReg, CmmExpr)])
+
+walk []     acc as = (acc, as)
+walk (n:ns) acc as
+  | Just a <- collect_it  = walk ns acc (a:as)
+  | otherwise             = walk ns (foldr (flip blockSnoc) acc (n:drop_nodes)) as'
+  where
+    collect_it = case n of
+                   CmmAssign (CmmLocal r) e@(CmmReg (CmmGlobal _)) -> Just (r,e)
+--                   CmmAssign (CmmLocal r) e@(CmmLoad addr _) |
+--                      foldRegsUsed (\b r -> False) True addr -> Just (r,e)
+                   _ -> Nothing
+
+    drop_nodes = toNodes dropped
+    (dropped, as') = partition should_drop as
+       where should_drop a = a `conflicts` n
+
+toNodes :: [(LocalReg,CmmExpr)] -> [CmmNode O O]
+toNodes as = [ CmmAssign (CmmLocal r) rhs | (r,rhs) <- as ]
+
+-- We only sink "r = G" assignments right now, so conflicts is very simple:
+conflicts :: (LocalReg,CmmExpr) -> CmmNode O O -> Bool
+(_, rhs) `conflicts` CmmAssign reg  _  | reg `regUsedIn` rhs = True
+--(r, CmmLoad _ _) `conflicts` CmmStore _ _ = True
+(r, _)   `conflicts` node
+  = foldRegsUsed (\b r' -> r == r' || b) False node
+
+conflictsWithLast :: (LocalReg,CmmExpr) -> CmmNode O C -> Bool
+(r, _) `conflictsWithLast` node
+  = foldRegsUsed (\b r' -> r == r' || b) False node
diff --git a/compiler/cmm/CmmLint.hs b/compiler/cmm/CmmLint.hs
index 01ebac6254..2e24dd7f82 100644
--- a/compiler/cmm/CmmLint.hs
+++ b/compiler/cmm/CmmLint.hs
@@ -1,67 +1,70 @@
 -----------------------------------------------------------------------------
 --
--- (c) The University of Glasgow 2004-2006
+-- (c) The University of Glasgow 2011
 --
 -- CmmLint: checking the correctness of Cmm statements and expressions
 --
 -----------------------------------------------------------------------------
-
-{-# OPTIONS -fno-warn-tabs #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and
--- detab the module (please do the detabbing in a separate patch). See
---     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
--- for details
-
+{-# LANGUAGE GADTs #-}
 module CmmLint (
-  cmmLint, cmmLintTop
+    cmmLint, cmmLintGraph
   ) where
 
+import Hoopl
+import Cmm
+import CmmUtils
+import PprCmm ()
 import BlockId
-import OldCmm
-import CLabel
+import FastString
 import Outputable
-import OldPprCmm()
 import Constants
-import FastString
-import Platform
 
 import Data.Maybe
 
+-- Things to check:
+--     - invariant on CmmBlock in CmmExpr (see comment there)
+--     - check for branches to blocks that don't exist
+--     - check types
+
 -- -----------------------------------------------------------------------------
 -- Exported entry points:
 
 cmmLint :: (Outputable d, Outputable h)
-        => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc
-cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops
+        => GenCmmGroup d h CmmGraph -> Maybe SDoc
+cmmLint tops = runCmmLint (mapM_ lintCmmDecl) tops
 
-cmmLintTop :: (Outputable d, Outputable h)
-           => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc
-cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top
+cmmLintGraph :: CmmGraph -> Maybe SDoc
+cmmLintGraph g = runCmmLint lintCmmGraph g
 
-runCmmLint :: Outputable a
-           => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc
-runCmmLint _ l p =
+runCmmLint :: Outputable a => (a -> CmmLint b) -> a -> Maybe SDoc
+runCmmLint l p =
    case unCL (l p) of
-   Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
-                           nest 2 err,
-                           ptext $ sLit ("Program was:"),
-                           nest 2 (ppr p)])
-   Right _  -> Nothing
-
-lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint ()
-lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks))
-  = addLintInfo (text "in proc " <> pprCLabel platform lbl) $
-        let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks
-	in  mapM_ (lintCmmBlock platform labels) blocks
-
-lintCmmDecl _ (CmmData {})
+     Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
+                             nest 2 err,
+                             ptext $ sLit ("Program was:"),
+                             nest 2 (ppr p)])
+     Right _  -> Nothing
+
+lintCmmDecl :: GenCmmDecl h i CmmGraph -> CmmLint ()
+lintCmmDecl (CmmProc _ lbl g)
+  = addLintInfo (text "in proc " <> ppr lbl) $ lintCmmGraph g
+lintCmmDecl (CmmData {})
   = return ()
 
-lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint ()
-lintCmmBlock platform labels (BasicBlock id stmts)
-  = addLintInfo (text "in basic block " <> ppr id) $
-	mapM_ (lintCmmStmt platform labels) stmts
+
+lintCmmGraph :: CmmGraph -> CmmLint ()
+lintCmmGraph g = mapM_ (lintCmmBlock labels) blocks
+  where
+       blocks = toBlockList g
+       labels = setFromList (map entryLabel blocks)
+
+
+lintCmmBlock :: BlockSet -> CmmBlock -> CmmLint ()
+lintCmmBlock labels block
+  = addLintInfo (text "in basic block " <> ppr (entryLabel block)) $ do
+        let (_, middle, last) = blockSplit block
+        mapM_ lintCmmMiddle (blockToList middle)
+        lintCmmLast labels last
 
 -- -----------------------------------------------------------------------------
 -- lintCmmExpr
@@ -69,24 +72,24 @@ lintCmmBlock platform labels (BasicBlock id stmts)
 -- Checks whether a CmmExpr is "type-correct", and check for obvious-looking
 -- byte/word mismatches.
 
-lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType
-lintCmmExpr platform (CmmLoad expr rep) = do
-  _ <- lintCmmExpr platform expr
+lintCmmExpr :: CmmExpr -> CmmLint CmmType
+lintCmmExpr (CmmLoad expr rep) = do
+  _ <- lintCmmExpr expr
   -- Disabled, if we have the inlining phase before the lint phase,
   -- we can have funny offsets due to pointer tagging. -- EZY
   -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
   --   cmmCheckWordAddress expr
   return rep
-lintCmmExpr platform expr@(CmmMachOp op args) = do
-  tys <- mapM (lintCmmExpr platform) args
+lintCmmExpr expr@(CmmMachOp op args) = do
+  tys <- mapM lintCmmExpr args
   if map (typeWidth . cmmExprType) args == machOpArgReps op
-  	then cmmCheckMachOp op args tys
-	else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
-lintCmmExpr platform (CmmRegOff reg offset)
-  = lintCmmExpr platform (CmmMachOp (MO_Add rep)
-		[CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
+        then cmmCheckMachOp op args tys
+        else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
+lintCmmExpr (CmmRegOff reg offset)
+  = lintCmmExpr (CmmMachOp (MO_Add rep)
+                [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
   where rep = typeWidth (cmmRegType reg)
-lintCmmExpr _ expr =
+lintCmmExpr expr =
   return (cmmExprType expr)
 
 -- Check for some common byte/word mismatches (eg. Sp + 1)
@@ -119,43 +122,61 @@ notNodeReg :: CmmExpr -> Bool
 notNodeReg (CmmReg reg) | reg == nodeReg = False
 notNodeReg _                             = True
 
-lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint ()
-lintCmmStmt platform labels = lint
-    where lint (CmmNop) = return ()
-          lint (CmmComment {}) = return ()
-          lint stmt@(CmmAssign reg expr) = do
-            erep <- lintCmmExpr platform expr
-	    let reg_ty = cmmRegType reg
+lintCmmMiddle :: CmmNode O O -> CmmLint ()
+lintCmmMiddle node = case node of
+  CmmComment _ -> return ()
+
+  CmmAssign reg expr -> do
+            erep <- lintCmmExpr expr
+            let reg_ty = cmmRegType reg
             if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
                 then return ()
-                else cmmLintAssignErr stmt erep reg_ty
-          lint (CmmStore l r) = do
-            _ <- lintCmmExpr platform l
-            _ <- lintCmmExpr platform r
+                else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
+
+  CmmStore l r -> do
+            _ <- lintCmmExpr l
+            _ <- lintCmmExpr r
             return ()
-          lint (CmmCall target _res args _) =
-              do lintTarget platform labels target
-                 mapM_ (lintCmmExpr platform . hintlessCmm) args
-          lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e
-          lint (CmmSwitch e branches) = do
+
+  CmmUnsafeForeignCall target _formals actuals -> do
+            lintTarget target
+            mapM_ lintCmmExpr actuals
+
+
+lintCmmLast :: BlockSet -> CmmNode O C -> CmmLint ()
+lintCmmLast labels node = case node of
+  CmmBranch id -> checkTarget id
+
+  CmmCondBranch e t f -> do
+            mapM_ checkTarget [t,f]
+            _ <- lintCmmExpr e
+            checkCond e
+
+  CmmSwitch e branches -> do
             mapM_ checkTarget $ catMaybes branches
-            erep <- lintCmmExpr platform e
+            erep <- lintCmmExpr e
             if (erep `cmmEqType_ignoring_ptrhood` bWord)
               then return ()
-              else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <>
-                               text " :: " <> ppr erep)
-          lint (CmmJump e _) = lintCmmExpr platform e >> return ()
-          lint (CmmReturn) = return ()
-          lint (CmmBranch id) = checkTarget id
-          checkTarget id = if setMember id labels then return ()
-                           else cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
-
-lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint ()
-lintTarget platform _      (CmmCallee e _) = do _ <- lintCmmExpr platform e
-                                                return ()
-lintTarget _        _      (CmmPrim _ Nothing) = return ()
-lintTarget platform labels (CmmPrim _ (Just stmts))
-    = mapM_ (lintCmmStmt platform labels) stmts
+              else cmmLintErr (text "switch scrutinee is not a word: " <>
+                               ppr e <> text " :: " <> ppr erep)
+
+  CmmCall { cml_target = target, cml_cont = cont } -> do
+          _ <- lintCmmExpr target
+          maybe (return ()) checkTarget cont
+
+  CmmForeignCall tgt _ args succ _ _ -> do
+          lintTarget tgt
+          mapM_ lintCmmExpr args
+          checkTarget succ
+ where
+  checkTarget id
+     | setMember id labels = return ()
+     | otherwise = cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
+
+
+lintTarget :: ForeignTarget -> CmmLint ()
+lintTarget (ForeignTarget e _) = lintCmmExpr e >> return ()
+lintTarget (PrimTarget {})     = return ()
 
 
 checkCond :: CmmExpr -> CmmLint ()
@@ -163,7 +184,7 @@ checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return ()
 checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values
 checkCond expr
     = cmmLintErr (hang (text "expression is not a conditional:") 2
-                       (ppr expr))
+                         (ppr expr))
 
 -- -----------------------------------------------------------------------------
 -- CmmLint monad
@@ -173,37 +194,36 @@ checkCond expr
 newtype CmmLint a = CmmLint { unCL :: Either SDoc a }
 
 instance Monad CmmLint where
-  CmmLint m >>= k = CmmLint $ case m of 
-				Left e -> Left e
-				Right a -> unCL (k a)
+  CmmLint m >>= k = CmmLint $ case m of
+                                Left e -> Left e
+                                Right a -> unCL (k a)
   return a = CmmLint (Right a)
 
 cmmLintErr :: SDoc -> CmmLint a
 cmmLintErr msg = CmmLint (Left msg)
 
 addLintInfo :: SDoc -> CmmLint a -> CmmLint a
-addLintInfo info thing = CmmLint $ 
+addLintInfo info thing = CmmLint $
    case unCL thing of
-	Left err -> Left (hang info 2 err)
-	Right a  -> Right a
+        Left err -> Left (hang info 2 err)
+        Right a  -> Right a
 
 cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a
 cmmLintMachOpErr expr argsRep opExpectsRep
-     = cmmLintErr (text "in MachOp application: " $$ 
-					nest 2 (ppr expr) $$
-				        (text "op is expecting: " <+> ppr opExpectsRep) $$
-					(text "arguments provide: " <+> ppr argsRep))
+     = cmmLintErr (text "in MachOp application: " $$
+                   nest 2 (ppr  expr) $$
+                      (text "op is expecting: " <+> ppr opExpectsRep) $$
+                      (text "arguments provide: " <+> ppr argsRep))
 
-cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a
+cmmLintAssignErr :: CmmNode e x -> CmmType -> CmmType -> CmmLint a
 cmmLintAssignErr stmt e_ty r_ty
-  = cmmLintErr (text "in assignment: " $$ 
-		nest 2 (vcat [ppr stmt, 
-			      text "Reg ty:" <+> ppr r_ty,
-			      text "Rhs ty:" <+> ppr e_ty]))
-			 
-					
+  = cmmLintErr (text "in assignment: " $$
+                nest 2 (vcat [ppr stmt,
+                              text "Reg ty:" <+> ppr r_ty,
+                              text "Rhs ty:" <+> ppr e_ty]))
+
 
 cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a
 cmmLintDubiousWordOffset expr
    = cmmLintErr (text "offset is not a multiple of words: " $$
-			nest 2 (ppr expr))
+                 nest 2 (ppr expr))
diff --git a/compiler/cmm/CmmLive.hs b/compiler/cmm/CmmLive.hs
index 9a5bb2d5ae..f0163fefc4 100644
--- a/compiler/cmm/CmmLive.hs
+++ b/compiler/cmm/CmmLive.hs
@@ -11,17 +11,15 @@ module CmmLive
     )
 where
 
+import UniqSupply
 import BlockId
 import Cmm
 import CmmUtils
-import Control.Monad
-import OptimizationFuel
 import PprCmmExpr ()
 
-import Compiler.Hoopl
+import Hoopl
 import Maybes
 import Outputable
-import UniqSet
 
 -----------------------------------------------------------------------------
 -- Calculating what variables are live on entry to a basic block
@@ -33,8 +31,10 @@ type CmmLive = RegSet
 -- | The dataflow lattice
 liveLattice :: DataflowLattice CmmLive
 liveLattice = DataflowLattice "live LocalReg's" emptyRegSet add
-    where add _ (OldFact old) (NewFact new) = case unionUniqSets old new of
-            join -> (changeIf $ sizeUniqSet join > sizeUniqSet old, join)
+    where add _ (OldFact old) (NewFact new) =
+               (changeIf $ sizeRegSet join > sizeRegSet old, join)
+              where !join = plusRegSet old new
+
 
 -- | A mapping from block labels to the variables live on entry
 type BlockEntryLiveness = BlockEnv CmmLive
@@ -43,16 +43,17 @@ type BlockEntryLiveness = BlockEnv CmmLive
 -- | Calculated liveness info for a CmmGraph
 -----------------------------------------------------------------------------
 
-cmmLiveness :: CmmGraph -> FuelUniqSM BlockEntryLiveness
+cmmLiveness :: CmmGraph -> BlockEntryLiveness
 cmmLiveness graph =
-  liftM check $ liftM snd $ dataflowPassBwd graph [] $ analBwd liveLattice xferLive
+  check $ dataflowAnalBwd graph [] $ analBwd liveLattice xferLive
   where entry = g_entry graph
-        check facts = noLiveOnEntry entry (expectJust "check" $ mapLookup entry facts) facts
+        check facts = noLiveOnEntry entry
+                        (expectJust "check" $ mapLookup entry facts) facts
 
 -- | On entry to the procedure, there had better not be any LocalReg's live-in.
 noLiveOnEntry :: BlockId -> CmmLive -> a -> a
 noLiveOnEntry bid in_fact x =
-  if isEmptyUniqSet in_fact then x
+  if nullRegSet in_fact then x
   else pprPanic "LocalReg's live-in to graph" (ppr bid <+> ppr in_fact)
 
 -- | The transfer equations use the traditional 'gen' and 'kill'
@@ -60,42 +61,42 @@ noLiveOnEntry bid in_fact x =
 gen  :: UserOfLocalRegs a    => a -> RegSet -> RegSet
 gen  a live = foldRegsUsed extendRegSet      live a
 kill :: DefinerOfLocalRegs a => a -> RegSet -> RegSet
-kill a live = foldRegsDefd delOneFromUniqSet live a
+kill a live = foldRegsDefd deleteFromRegSet live a
 
-gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a) => a -> CmmLive -> CmmLive
+gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a)
+          => a -> CmmLive -> CmmLive
 gen_kill a = gen a . kill a
 
 -- | The transfer function
--- EZY: Bits of this analysis are duplicated in CmmSpillReload, though
--- it's not really easy to efficiently reuse all of this.  Keep in mind
--- if you need to update this analysis.
 xferLive :: BwdTransfer CmmNode CmmLive
 xferLive = mkBTransfer3 fst mid lst
   where fst _ f = f
         mid :: CmmNode O O -> CmmLive -> CmmLive
         mid n f = gen_kill n f
         lst :: CmmNode O C -> FactBase CmmLive -> CmmLive
-        -- slightly inefficient: kill is unnecessary for emptyRegSet
-        lst n f = gen_kill n
-                $ case n of CmmCall{}        -> emptyRegSet
-                            CmmForeignCall{} -> emptyRegSet
-                            _                -> joinOutFacts liveLattice n f
+        lst n f = gen_kill n $ joinOutFacts liveLattice n f
 
 -----------------------------------------------------------------------------
 -- Removing assignments to dead variables
 -----------------------------------------------------------------------------
 
-removeDeadAssignments :: CmmGraph -> FuelUniqSM CmmGraph
+removeDeadAssignments :: CmmGraph -> UniqSM (CmmGraph, BlockEnv CmmLive)
 removeDeadAssignments g =
-   liftM fst $ dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites
-   where rewrites = deepBwdRw3 nothing middle nothing
-         -- Beware: deepBwdRw with one polymorphic function seems more reasonable here,
-         -- but GHC panics while compiling, see bug #4045.
+   dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites
+   where rewrites = mkBRewrite3 nothing middle nothing
+         -- SDM: no need for deepBwdRw here, we only rewrite to empty
+         -- Beware: deepBwdRw with one polymorphic function seems more
+         -- reasonable here, but GHC panics while compiling, see bug
+         -- #4045.
          middle :: CmmNode O O -> Fact O CmmLive -> CmmReplGraph O O
-         middle (CmmAssign (CmmLocal reg') _) live | not (reg' `elemRegSet` live) = return $ Just emptyGraph
+         middle (CmmAssign (CmmLocal reg') _) live
+                 | not (reg' `elemRegSet` live)
+                 = return $ Just emptyGraph
          -- XXX maybe this should be somewhere else...
-         middle (CmmAssign lhs (CmmReg rhs))   _ | lhs == rhs = return $ Just emptyGraph
-         middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs = return $ Just emptyGraph
+         middle (CmmAssign lhs (CmmReg rhs))   _ | lhs == rhs
+                 = return $ Just emptyGraph
+         middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs
+                 = return $ Just emptyGraph
          middle _ _ = return Nothing
 
          nothing :: CmmNode e x -> Fact x CmmLive -> CmmReplGraph e x
diff --git a/compiler/cmm/CmmNode.hs b/compiler/cmm/CmmNode.hs
index 4844af9d9a..0a5f5170f0 100644
--- a/compiler/cmm/CmmNode.hs
+++ b/compiler/cmm/CmmNode.hs
@@ -18,7 +18,7 @@ module CmmNode (
      CmmNode(..), ForeignHint(..), CmmFormal, CmmActual,
      UpdFrameOffset, Convention(..), ForeignConvention(..), ForeignTarget(..),
      mapExp, mapExpDeep, wrapRecExp, foldExp, foldExpDeep, wrapRecExpf,
-     mapExpM, mapExpDeepM, wrapRecExpM
+     mapExpM, mapExpDeepM, wrapRecExpM, mapSuccessors
   ) where
 
 import CmmExpr
@@ -35,15 +35,17 @@ import Prelude hiding (succ)
 ------------------------
 -- CmmNode
 
+#define ULabel {-# UNPACK #-} !Label
+
 data CmmNode e x where
-  CmmEntry :: Label -> CmmNode C O
+  CmmEntry :: ULabel -> CmmNode C O
 
   CmmComment :: FastString -> CmmNode O O
 
-  CmmAssign :: CmmReg -> CmmExpr -> CmmNode O O
+  CmmAssign :: !CmmReg -> !CmmExpr -> CmmNode O O
     -- Assign to register
 
-  CmmStore :: CmmExpr -> CmmExpr -> CmmNode O O
+  CmmStore :: !CmmExpr -> !CmmExpr -> CmmNode O O
     -- Assign to memory location.  Size is
     -- given by cmmExprType of the rhs.
 
@@ -60,11 +62,12 @@ data CmmNode e x where
       --            bug for what can be put in arguments, see
       --            Note [Register Parameter Passing]
 
-  CmmBranch :: Label -> CmmNode O C  -- Goto another block in the same procedure
+  CmmBranch :: ULabel -> CmmNode O C
+                                   -- Goto another block in the same procedure
 
   CmmCondBranch :: {                 -- conditional branch
       cml_pred :: CmmExpr,
-      cml_true, cml_false :: Label
+      cml_true, cml_false :: ULabel
   } -> CmmNode O C
 
   CmmSwitch :: CmmExpr -> [Maybe Label] -> CmmNode O C -- Table branch
@@ -78,15 +81,20 @@ data CmmNode e x where
 
       cml_cont :: Maybe Label,
           -- Label of continuation (Nothing for return or tail call)
-
--- ToDO: add this:
---       cml_args_regs :: [GlobalReg],
--- It says which GlobalRegs are live for the parameters at the
--- moment of the call.  Later stages can use this to give liveness
--- everywhere, which in turn guides register allocation.
--- It is the companion of cml_args; cml_args says which stack words
--- hold parameters, while cml_arg_regs says which global regs hold parameters.
--- But do note [Register parameter passing]
+          --
+          -- Note [Continuation BlockId]: these BlockIds are called
+          -- Continuation BlockIds, and are the only BlockIds that can
+          -- occur in CmmExprs, namely as (CmmLit (CmmBlock b)) or
+          -- (CmmStackSlot (Young b) _).
+
+      cml_args_regs :: [GlobalReg],
+          -- The argument GlobalRegs (Rx, Fx, Dx, Lx) that are passed
+          -- to the call.  This is essential information for the
+          -- native code generator's register allocator; without
+          -- knowing which GlobalRegs are live it has to assume that
+          -- they are all live.  This list should only include
+          -- GlobalRegs that are mapped to real machine registers on
+          -- the target platform.
 
       cml_args :: ByteOff,
           -- Byte offset, from the *old* end of the Area associated with
@@ -117,7 +125,7 @@ data CmmNode e x where
       tgt   :: ForeignTarget,   -- call target and convention
       res   :: [CmmFormal],     -- zero or more results
       args  :: [CmmActual],     -- zero or more arguments; see Note [Register parameter passing]
-      succ  :: Label,           -- Label of continuation
+      succ  :: ULabel,          -- Label of continuation
       updfr :: UpdFrameOffset,  -- where the update frame is (for building infotable)
       intrbl:: Bool             -- whether or not the call is interruptible
   } -> CmmNode O C
@@ -181,7 +189,7 @@ instance Eq (CmmNode e x) where
   (CmmBranch a)                == (CmmBranch a')                  = a==a'
   (CmmCondBranch a b c)        == (CmmCondBranch a' b' c')        = a==a' && b==b' && c==c'
   (CmmSwitch a b)              == (CmmSwitch a' b')               = a==a' && b==b'
-  (CmmCall a b c d e)          == (CmmCall a' b' c' d' e')        = a==a' && b==b' && c==c' && d==d' && e==e'
+  (CmmCall a b c d e f)          == (CmmCall a' b' c' d' e' f')   = a==a' && b==b' && c==c' && d==d' && e==e' && f==f'
   (CmmForeignCall a b c d e f) == (CmmForeignCall a' b' c' d' e' f') = a==a' && b==b' && c==c' && d==d' && e==e' && f==f'
   _                            == _                               = False
 
@@ -198,10 +206,6 @@ instance NonLocal CmmNode where
   successors (CmmForeignCall {succ=l}) = [l]
 
 
-instance HooplNode CmmNode where
-  mkBranchNode label = CmmBranch label
-  mkLabelNode label  = CmmEntry label
-
 --------------------------------------------------
 -- Various helper types
 
@@ -218,14 +222,6 @@ data Convention
   | GC               -- Entry to the garbage collector: uses the node reg!
   | PrimOpCall       -- Calling prim ops
   | PrimOpReturn     -- Returning from prim ops
-  | Foreign          -- Foreign call/return
-        ForeignConvention
-  | Private
-        -- Used for control transfers within a (pre-CPS) procedure All
-        -- jump sites known, never pushed on the stack (hence no SRT)
-        -- You can choose whatever calling convention you please
-        -- (provided you make sure all the call sites agree)!
-        -- This data type eventually to be extended to record the convention.
   deriving( Eq )
 
 data ForeignConvention
@@ -283,37 +279,6 @@ instance DefinerOfLocalRegs (CmmNode e x) where
           fold f z n = foldRegsDefd f z n
 
 
-instance UserOfSlots (CmmNode e x) where
-  foldSlotsUsed f z n = case n of
-    CmmAssign _ expr -> fold f z expr
-    CmmStore addr rval -> fold f (fold f z addr) rval
-    CmmUnsafeForeignCall _ _ args -> fold f z args
-    CmmCondBranch expr _ _ -> fold f z expr
-    CmmSwitch expr _ -> fold f z expr
-    CmmCall {cml_target=tgt} -> fold f z tgt
-    CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args
-    _ -> z
-    where fold :: forall a b.
-                       UserOfSlots a =>
-                       (b -> SubArea -> b) -> b -> a -> b
-          fold f z n = foldSlotsUsed f z n
-
-instance UserOfSlots ForeignTarget where
-  foldSlotsUsed  f z (ForeignTarget e _) = foldSlotsUsed f z e
-  foldSlotsUsed _f z (PrimTarget _)      = z
-
-instance DefinerOfSlots (CmmNode e x) where
-  foldSlotsDefd f z n = case n of
-    CmmStore (CmmStackSlot a i) expr -> f z (a, i, widthInBytes $ typeWidth $ cmmExprType expr)
-    CmmForeignCall {res=res} -> fold f z $ map foreign_call_slot res
-    _ -> z
-    where
-          fold :: forall a b.
-                  DefinerOfSlots a =>
-                  (b -> SubArea -> b) -> b -> a -> b
-          fold f z n = foldSlotsDefd f z n
-          foreign_call_slot r = case widthInBytes $ typeWidth $ localRegType r of w -> (RegSlot r, w, w)
-
 -----------------------------------
 -- mapping Expr in CmmNode
 
@@ -336,7 +301,7 @@ mapExp f   (CmmUnsafeForeignCall tgt fs as)      = CmmUnsafeForeignCall (mapFore
 mapExp _ l@(CmmBranch _)                         = l
 mapExp f   (CmmCondBranch e ti fi)               = CmmCondBranch (f e) ti fi
 mapExp f   (CmmSwitch e tbl)                     = CmmSwitch (f e) tbl
-mapExp f   (CmmCall tgt mb_id o i s)             = CmmCall (f tgt) mb_id o i s
+mapExp f   n@CmmCall {cml_target=tgt}            = n{cml_target = f tgt}
 mapExp f   (CmmForeignCall tgt fs as succ updfr intrbl) = CmmForeignCall (mapForeignTarget f tgt) fs (map f as) succ updfr intrbl
 
 mapExpDeep :: (CmmExpr -> CmmExpr) -> CmmNode e x -> CmmNode e x
@@ -362,7 +327,7 @@ mapExpM f (CmmStore addr e)         = (\[addr', e'] -> CmmStore addr' e') `fmap`
 mapExpM _ (CmmBranch _)             = Nothing
 mapExpM f (CmmCondBranch e ti fi)   = (\x -> CmmCondBranch x ti fi) `fmap` f e
 mapExpM f (CmmSwitch e tbl)         = (\x -> CmmSwitch x tbl)       `fmap` f e
-mapExpM f (CmmCall tgt mb_id o i s) = (\x -> CmmCall x mb_id o i s) `fmap` f tgt
+mapExpM f (CmmCall tgt mb_id r o i s) = (\x -> CmmCall x mb_id r o i s) `fmap` f tgt
 mapExpM f (CmmUnsafeForeignCall tgt fs as)
     = case mapForeignTargetM f tgt of
         Just tgt' -> Just (CmmUnsafeForeignCall tgt' fs (mapListJ f as))
@@ -416,4 +381,20 @@ foldExp f (CmmCall {cml_target=tgt}) z            = f tgt z
 foldExp f (CmmForeignCall {tgt=tgt, args=args}) z = foldr f (foldExpForeignTarget f tgt z) args
 
 foldExpDeep :: (CmmExpr -> z -> z) -> CmmNode e x -> z -> z
-foldExpDeep f = foldExp $ wrapRecExpf f
+foldExpDeep f = foldExp go
+  where -- go :: CmmExpr -> z -> z
+        go e@(CmmMachOp _ es) z = gos es $! f e z
+        go e@(CmmLoad addr _) z = go addr $! f e z
+        go e                  z = f e z
+
+        gos [] z = z
+        gos (e:es) z = gos es $! f e z
+
+-- -----------------------------------------------------------------------------
+
+mapSuccessors :: (Label -> Label) -> CmmNode O C -> CmmNode O C
+mapSuccessors f (CmmBranch bid)        = CmmBranch (f bid)
+mapSuccessors f (CmmCondBranch p y n)  = CmmCondBranch p (f y) (f n)
+mapSuccessors f (CmmSwitch e arms)     = CmmSwitch e (map (fmap f) arms)
+mapSuccessors _ n = n
+
diff --git a/compiler/cmm/CmmOpt.hs b/compiler/cmm/CmmOpt.hs
index 8cc18fc1ca..8ff04cfa7b 100644
--- a/compiler/cmm/CmmOpt.hs
+++ b/compiler/cmm/CmmOpt.hs
@@ -145,8 +145,7 @@ To inline _smi:
 -}
 
 countUses :: UserOfLocalRegs a => a -> UniqFM Int
-countUses a = foldRegsUsed (\m r -> addToUFM m r (count m r + 1)) emptyUFM a
-  where count m r = lookupWithDefaultUFM m (0::Int) r
+countUses a = foldRegsUsed (\m r -> addToUFM_C (+) m r 1) emptyUFM a
 
 cmmMiniInline :: DynFlags -> [CmmBasicBlock] -> [CmmBasicBlock]
 cmmMiniInline dflags blocks = map do_inline blocks
@@ -157,25 +156,16 @@ cmmMiniInlineStmts :: DynFlags -> UniqFM Int -> [CmmStmt] -> [CmmStmt]
 cmmMiniInlineStmts _      _    [] = []
 cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr) : stmts)
         -- not used: just discard this assignment
-  | Nothing <- lookupUFM uses u
+  | 0 <- lookupWithDefaultUFM uses 0 u
   = cmmMiniInlineStmts dflags uses stmts
 
-        -- used (literal): try to inline at all the use sites
-  | Just n <- lookupUFM uses u, isLit expr
-  =
-     ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
-     case lookForInlineLit u expr stmts of
-         (m, stmts')
-             | n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts'
-             | otherwise ->
-                 stmt : cmmMiniInlineStmts dflags (adjustUFM (\x -> x - m) uses u) stmts'
-
-        -- used (foldable to literal): try to inline at all the use sites
+        -- used (foldable to small thing): try to inline at all the use sites
   | Just n <- lookupUFM uses u,
-    e@(CmmLit _) <- wrapRecExp foldExp expr
+    e <- wrapRecExp foldExp expr,
+    isTiny e
   =
      ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
-     case lookForInlineLit u e stmts of
+     case lookForInlineMany u e stmts of
          (m, stmts')
              | n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts'
              | otherwise ->
@@ -188,6 +178,11 @@ cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr)
      ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
      cmmMiniInlineStmts dflags uses stmts'
  where
+  isTiny (CmmLit _) = True
+  isTiny (CmmReg (CmmGlobal _)) = True
+         -- not CmmLocal: that might invalidate the usage analysis results
+  isTiny _ = False
+
   platform = targetPlatform dflags
   foldExp (CmmMachOp op args) = cmmMachOpFold platform op args
   foldExp e = e
@@ -201,26 +196,28 @@ cmmMiniInlineStmts platform uses (stmt:stmts)
 -- register, and a list of statements.  Inlines the expression at all
 -- use sites of the register.  Returns the number of substituations
 -- made and the, possibly modified, list of statements.
-lookForInlineLit :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt])
-lookForInlineLit _ _ [] = (0, [])
-lookForInlineLit u expr stmts@(stmt : rest)
-  | Just n <- lookupUFM (countUses stmt) u
-  = case lookForInlineLit u expr rest of
-      (m, stmts) -> let z = n + m
-                    in z `seq` (z, inlineStmt u expr stmt : stmts)
-
-  | ok_to_skip
-  = case lookForInlineLit u expr rest of
+lookForInlineMany :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt])
+lookForInlineMany u expr stmts = lookForInlineMany' u expr regset stmts
+    where regset = foldRegsUsed extendRegSet emptyRegSet expr
+
+lookForInlineMany' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> (Int, [CmmStmt])
+lookForInlineMany' _ _ _ [] = (0, [])
+lookForInlineMany' u expr regset stmts@(stmt : rest)
+  | Just n <- lookupUFM (countUses stmt) u, okToInline expr stmt
+  = let stmt' = inlineStmt u expr stmt in
+    if okToSkip stmt' u expr regset
+       then case lookForInlineMany' u expr regset rest of
+                       (m, stmts) -> let z = n + m
+                                     in z `seq` (z, stmt' : stmts)
+       else (n, stmt' : rest)
+
+  | okToSkip stmt u expr regset
+  = case lookForInlineMany' u expr regset rest of
       (n, stmts) -> (n, stmt : stmts)
 
   | otherwise
   = (0, stmts)
-  where
-    -- We skip over assignments to registers, unless the register
-    -- being assigned to is the one we're inlining.
-    ok_to_skip = case stmt of
-        CmmAssign (CmmLocal (LocalReg u' _)) _ | u' == u -> False
-        _other -> True
+
 
 lookForInline :: Unique -> CmmExpr -> [CmmStmt] -> Maybe [CmmStmt]
 lookForInline u expr stmts = lookForInline' u expr regset stmts
@@ -229,10 +226,10 @@ lookForInline u expr stmts = lookForInline' u expr regset stmts
 lookForInline' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> Maybe [CmmStmt]
 lookForInline' _ _    _      [] = panic "lookForInline' []"
 lookForInline' u expr regset (stmt : rest)
-  | Just 1 <- lookupUFM (countUses stmt) u, ok_to_inline
+  | Just 1 <- lookupUFM (countUses stmt) u, okToInline expr stmt
   = Just (inlineStmt u expr stmt : rest)
 
-  | ok_to_skip
+  | okToSkip stmt u expr regset
   = case lookForInline' u expr regset rest of
            Nothing    -> Nothing
            Just stmts -> Just (stmt:stmts)
@@ -240,31 +237,37 @@ lookForInline' u expr regset (stmt : rest)
   | otherwise 
   = Nothing
 
-  where
-        -- we don't inline into CmmCall if the expression refers to global
-        -- registers.  This is a HACK to avoid global registers clashing with
-        -- C argument-passing registers, really the back-end ought to be able
-        -- to handle it properly, but currently neither PprC nor the NCG can
-        -- do it.  See also CgForeignCall:load_args_into_temps.
-    ok_to_inline = case stmt of
-                     CmmCall{} -> hasNoGlobalRegs expr
-                     _ -> True
-
-   -- Expressions aren't side-effecting.  Temporaries may or may not
-   -- be single-assignment depending on the source (the old code
-   -- generator creates single-assignment code, but hand-written Cmm
-   -- and Cmm from the new code generator is not single-assignment.)
-   -- So we do an extra check to make sure that the register being
-   -- changed is not one we were relying on.  I don't know how much of a
-   -- performance hit this is (we have to create a regset for every
-   -- instruction.) -- EZY
-    ok_to_skip = case stmt of
-                 CmmNop -> True
-                 CmmComment{} -> True
-                 CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True
-                 CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr)
-                 _other -> False
 
+-- we don't inline into CmmCall if the expression refers to global
+-- registers.  This is a HACK to avoid global registers clashing with
+-- C argument-passing registers, really the back-end ought to be able
+-- to handle it properly, but currently neither PprC nor the NCG can
+-- do it.  See also CgForeignCall:load_args_into_temps.
+okToInline :: CmmExpr -> CmmStmt -> Bool
+okToInline expr CmmCall{} = hasNoGlobalRegs expr
+okToInline _ _ = True
+
+-- Expressions aren't side-effecting.  Temporaries may or may not
+-- be single-assignment depending on the source (the old code
+-- generator creates single-assignment code, but hand-written Cmm
+-- and Cmm from the new code generator is not single-assignment.)
+-- So we do an extra check to make sure that the register being
+-- changed is not one we were relying on.  I don't know how much of a
+-- performance hit this is (we have to create a regset for every
+-- instruction.) -- EZY
+okToSkip :: CmmStmt -> Unique -> CmmExpr -> RegSet -> Bool
+okToSkip stmt u expr regset
+   = case stmt of
+         CmmNop -> True
+         CmmComment{} -> True
+         CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True
+         CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr)
+         CmmStore _ _ -> not_a_load expr
+         _other -> False
+  where
+    not_a_load (CmmMachOp _ args) = all not_a_load args
+    not_a_load (CmmLoad _ _) = False
+    not_a_load _ = True
 
 inlineStmt :: Unique -> CmmExpr -> CmmStmt -> CmmStmt
 inlineStmt u a (CmmAssign r e) = CmmAssign r (inlineExpr u a e)
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index 075ed22ea9..f46d49e022 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -230,35 +230,31 @@ lits	:: { [ExtFCode CmmExpr] }
 
 cmmproc :: { ExtCode }
 -- TODO: add real SRT/info tables to parsed Cmm
-	: info maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}'
-		{ do ((entry_ret_label, info, live, formals, gc_block, frame), stmts) <-
+        : info maybe_formals_without_hints '{' body '}'
+                { do ((entry_ret_label, info, live, formals), stmts) <-
 		       getCgStmtsEC' $ loopDecls $ do {
 		         (entry_ret_label, info, live) <- $1;
 		         formals <- sequence $2;
-		         gc_block <- $3;
-		         frame <- $4;
-		         $6;
-		         return (entry_ret_label, info, live, formals, gc_block, frame) }
+                         $4;
+                         return (entry_ret_label, info, live, formals) }
 		     blks <- code (cgStmtsToBlocks stmts)
-		     code (emitInfoTableAndCode entry_ret_label (CmmInfo gc_block frame info) formals blks) }
+                     code (emitInfoTableAndCode entry_ret_label info formals blks) }
 
 	| info maybe_formals_without_hints ';'
 		{ do (entry_ret_label, info, live) <- $1;
 		     formals <- sequence $2;
-		     code (emitInfoTableAndCode entry_ret_label (CmmInfo Nothing Nothing info) formals []) }
+                     code (emitInfoTableAndCode entry_ret_label info formals []) }
 
-	| NAME maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}'
+        | NAME maybe_formals_without_hints '{' body '}'
 		{% withThisPackage $ \pkg ->
 		   do	newFunctionName $1 pkg
-		   	((formals, gc_block, frame), stmts) <-
+                        (formals, stmts) <-
 			 	getCgStmtsEC' $ loopDecls $ do {
 		          		formals <- sequence $2;
-		          		gc_block <- $3;
-			  		frame <- $4;
-		          		$6;
-		          		return (formals, gc_block, frame) }
+                                        $4;
+                                        return formals }
 			blks <- code (cgStmtsToBlocks stmts)
-			code (emitProc (CmmInfo gc_block frame CmmNonInfoTable) (mkCmmCodeLabel pkg $1) formals blks) }
+                        code (emitProc CmmNonInfoTable (mkCmmCodeLabel pkg $1) formals blks) }
 
 info	:: { ExtFCode (CLabel, CmmInfoTable, [Maybe LocalReg]) }
 	: 'INFO_TABLE' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ')'
@@ -599,18 +595,7 @@ formals_without_hints :: { [ExtFCode LocalReg] }
 formal_without_hint :: { ExtFCode LocalReg }
 	: type NAME		{ newLocal $1 $2 }
 
-maybe_frame :: { ExtFCode (Maybe UpdateFrame) }
-	: {- empty -}			{ return Nothing }
-	| 'jump' expr '(' exprs0 ')'	{ do { target <- $2;
-					       args <- sequence $4;
-					       return $ Just (UpdateFrame target args) } }
-
-maybe_gc_block :: { ExtFCode (Maybe BlockId) }
-	: {- empty -}			{ return Nothing }
-	| 'goto' NAME
-		{ do l <- lookupLabel $2; return (Just l) }
-
-type	:: { CmmType }
+type    :: { CmmType }
 	: 'bits8'		{ b8 }
 	| typenot8		{ $1 }
 
@@ -1073,7 +1058,8 @@ parseCmmFile dflags filename = do
         let msg = mkPlainErrMsg dflags span err
         return ((emptyBag, unitBag msg), Nothing)
     POk pst code -> do
-        cmm <- initC dflags no_module (getCmm (unEC code initEnv [] >> return ()))
+        st <- initC
+        let (cmm,_) = runC dflags no_module st (getCmm (unEC code initEnv [] >> return ()))
         let ms = getMessages pst
         if (errorsFound dflags ms)
          then return (ms, Nothing)
diff --git a/compiler/cmm/CmmPipeline.hs b/compiler/cmm/CmmPipeline.hs
index 7361bbf385..bb8d5b2f22 100644
--- a/compiler/cmm/CmmPipeline.hs
+++ b/compiler/cmm/CmmPipeline.hs
@@ -11,25 +11,23 @@ module CmmPipeline (
 
 import CLabel
 import Cmm
-import CmmLive
+import CmmLint
 import CmmBuildInfoTables
 import CmmCommonBlockElim
 import CmmProcPoint
-import CmmSpillReload
-import CmmRewriteAssignments
-import CmmStackLayout
 import CmmContFlowOpt
-import OptimizationFuel
+import CmmLayoutStack
 
+import UniqSupply
 import DynFlags
 import ErrUtils
 import HscTypes
 import Data.Maybe
 import Control.Monad
-import Data.Map (Map)
-import qualified Data.Map as Map
 import Outputable
-import StaticFlags
+
+import qualified Data.Set as Set
+import Data.Map (Map)
 
 -----------------------------------------------------------------------------
 -- | Top level driver for C-- pipeline
@@ -53,32 +51,28 @@ import StaticFlags
 --    we actually need to do the initial pass.
 cmmPipeline  :: HscEnv -- Compilation env including
                        -- dynamic flags: -dcmm-lint -ddump-cps-cmm
-             -> (TopSRT, [CmmGroup])    -- SRT table and accumulating list of compiled procs
+             -> TopSRT     -- SRT table and accumulating list of compiled procs
              -> CmmGroup             -- Input C-- with Procedures
-             -> IO (TopSRT, [CmmGroup]) -- Output CPS transformed C--
-cmmPipeline hsc_env (topSRT, rst) prog =
+             -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
+cmmPipeline hsc_env topSRT prog =
   do let dflags = hsc_dflags hsc_env
      --
      showPass dflags "CPSZ"
 
-     let tops = runCmmContFlowOpts prog
-     (cafEnvs, tops) <- liftM unzip $ mapM (cpsTop hsc_env) tops
+     (cafEnvs, tops) <- {-# SCC "tops" #-} liftM unzip $ mapM (cpsTop hsc_env) prog
      -- tops :: [[(CmmDecl,CAFSet]]  (one list per group)
 
-     let topCAFEnv = mkTopCAFInfo (concat cafEnvs)
+     let topCAFEnv = {-# SCC "topCAFEnv" #-} mkTopCAFInfo (concat cafEnvs)
 
      -- folding over the groups
-     (topSRT, tops) <- foldM (toTops hsc_env topCAFEnv) (topSRT, []) tops
+     (topSRT, tops) <- {-# SCC "toTops" #-} foldM (toTops topCAFEnv) (topSRT, []) tops
 
      let cmms :: CmmGroup
          cmms = reverse (concat tops)
 
      dumpIfSet_dyn dflags Opt_D_dump_cps_cmm "Post CPS Cmm" (ppr cmms)
 
-     -- SRT is not affected by control flow optimization pass
-     let prog' = runCmmContFlowOpts cmms
-
-     return (topSRT, prog' : rst)
+     return (topSRT, cmms)
 
 {- [Note global fuel]
 ~~~~~~~~~~~~~~~~~~~~~
@@ -92,105 +86,110 @@ global to one compiler session.
 -- -ddump-cmmz
 
 cpsTop :: HscEnv -> CmmDecl -> IO ([(CLabel, CAFSet)], [(CAFSet, CmmDecl)])
-cpsTop _ p@(CmmData {}) = return ([], [(Map.empty, p)])
+cpsTop _ p@(CmmData {}) = return ([], [(Set.empty, p)])
 cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}}) l g) =
     do
-       -- Why bother doing these early: dualLivenessWithInsertion,
-       -- insertLateReloads, rewriteAssignments?
+       ----------- Control-flow optimisations ---------------
+       g <- {-# SCC "cmmCfgOpts(1)" #-} return $ cmmCfgOpts g
+       dump Opt_D_dump_cmmz_cfg "Post control-flow optimsations" g
 
        ----------- Eliminate common blocks -------------------
-       g <- return $ elimCommonBlocks g
+       g <- {-# SCC "elimCommonBlocks" #-} return $ elimCommonBlocks g
        dump Opt_D_dump_cmmz_cbe "Post common block elimination" g
-       -- Any work storing block Labels must be performed _after_ elimCommonBlocks
+       -- Any work storing block Labels must be performed _after_
+       -- elimCommonBlocks
 
        ----------- Proc points -------------------
-       let callPPs = callProcPoints g
-       procPoints <- run $ minimalProcPointSet (targetPlatform dflags) callPPs g
-       g <- run $ addProcPointProtocols callPPs procPoints g
-       dump Opt_D_dump_cmmz_proc "Post Proc Points Added" g
-
-       ----------- Spills and reloads -------------------
-       g <- run $ dualLivenessWithInsertion procPoints g
-       dump Opt_D_dump_cmmz_spills "Post spills and reloads" g
-
-       ----------- Sink and inline assignments -------------------
-       g <- runOptimization $ rewriteAssignments platform g
-       dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g
-
-       ----------- Eliminate dead assignments -------------------
-       g <- runOptimization $ removeDeadAssignments g
-       dump Opt_D_dump_cmmz_dead "Post remove dead assignments" g
-
-       ----------- Zero dead stack slots (Debug only) ---------------
-       -- Debugging: stubbing slots on death can cause crashes early
-       g <- if opt_StubDeadValues
-                then run $ stubSlotsOnDeath g
-                else return g
-       dump Opt_D_dump_cmmz_stub "Post stub dead stack slots" g
-
-       --------------- Stack layout ----------------
-       slotEnv <- run $ liveSlotAnal g
-       let spEntryMap = getSpEntryMap entry_off g
-       mbpprTrace "live slot analysis results: " (ppr slotEnv) $ return ()
-       let areaMap = layout procPoints spEntryMap slotEnv entry_off g
-       mbpprTrace "areaMap" (ppr areaMap) $ return ()
-
-       ------------  Manifest the stack pointer --------
-       g  <- run $ manifestSP spEntryMap areaMap entry_off g
-       dump Opt_D_dump_cmmz_sp "Post manifestSP" g
-       -- UGH... manifestSP can require updates to the procPointMap.
-       -- We can probably do something quicker here for the update...
+       let callPPs = {-# SCC "callProcPoints" #-} callProcPoints g
+       procPoints <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
+                     minimalProcPointSet (targetPlatform dflags) callPPs g
+
+       ----------- Layout the stack and manifest Sp ---------------
+       -- (also does: removeDeadAssignments, and lowerSafeForeignCalls)
+       (g, stackmaps) <- {-# SCC "layoutStack" #-}
+                         runUniqSM $ cmmLayoutStack procPoints entry_off g
+       dump Opt_D_dump_cmmz_sp "Layout Stack" g
+
+--       g <- {-# SCC "sink" #-} runUniqSM $ cmmSink g
+--       dump Opt_D_dump_cmmz_rewrite "Sink assignments" g
+
+--       ----------- Sink and inline assignments -------------------
+--       g <- {-# SCC "rewriteAssignments" #-} runOptimization $
+--            rewriteAssignments platform g
+--       dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g
 
        ------------- Split into separate procedures ------------
-       procPointMap  <- run $ procPointAnalysis procPoints g
-       dump Opt_D_dump_cmmz_procmap "procpoint map" procPointMap
-       gs <- run $ splitAtProcPoints l callPPs procPoints procPointMap
-                                       (CmmProc h l g)
-       mapM_ (dump Opt_D_dump_cmmz_split "Post splitting") gs
-
-       ------------- More CAFs and foreign calls ------------
-       cafEnv <- run $ cafAnal g
-       let localCAFs = catMaybes $ map (localCAFInfo cafEnv) gs
+       procPointMap  <- {-# SCC "procPointAnalysis" #-} runUniqSM $
+                        procPointAnalysis procPoints g
+       dumpWith dflags Opt_D_dump_cmmz_procmap "procpoint map" procPointMap
+       gs <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
+             splitAtProcPoints l callPPs procPoints procPointMap (CmmProc h l g)
+       dumps Opt_D_dump_cmmz_split "Post splitting" gs
+
+       ------------- More CAFs ------------------------------
+       let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
+       let localCAFs = {-# SCC "localCAFs" #-} catMaybes $ map (localCAFInfo cafEnv) gs
        mbpprTrace "localCAFs" (ppr localCAFs) $ return ()
 
-       gs <- run $ mapM (lowerSafeForeignCalls areaMap) gs
-       mapM_ (dump Opt_D_dump_cmmz_lower "Post lowerSafeForeignCalls") gs
-
        -- NO MORE GRAPH TRANSFORMATION AFTER HERE -- JUST MAKING INFOTABLES
-       gs <- return $ map (setInfoTableStackMap slotEnv areaMap) gs
-       mapM_ (dump Opt_D_dump_cmmz_info "after setInfoTableStackMap") gs
-       gs <- return $ map (bundleCAFs cafEnv) gs
-       mapM_ (dump Opt_D_dump_cmmz_cafs "after bundleCAFs") gs
+       gs <- {-# SCC "setInfoTableStackMap" #-}
+             return $ map (setInfoTableStackMap stackmaps) gs
+       dumps Opt_D_dump_cmmz_info "after setInfoTableStackMap" gs
+
+       ----------- Control-flow optimisations ---------------
+       gs <- {-# SCC "cmmCfgOpts(2)" #-} return $ map cmmCfgOptsProc gs
+       dumps Opt_D_dump_cmmz_cfg "Post control-flow optimsations" gs
+
+       gs <- {-# SCC "bundleCAFs" #-} return $ map (bundleCAFs cafEnv) gs
+       dumps Opt_D_dump_cmmz_cafs "after bundleCAFs" gs
+
        return (localCAFs, gs)
 
               -- gs        :: [ (CAFSet, CmmDecl) ]
               -- localCAFs :: [ (CLabel, CAFSet) ] -- statics filtered out(?)
 
   where dflags = hsc_dflags hsc_env
-        platform = targetPlatform dflags
-        mbpprTrace x y z = if dopt Opt_D_dump_cmmz dflags then pprTrace x y z else z
-        dump f = dumpWith ppr f
-        dumpWith pprFun f txt g = do
-            -- ToDo: No easy way of say "dump all the cmmz, *and* split
-            -- them into files."  Also, -ddump-cmmz doesn't play nicely
-            -- with -ddump-to-file, since the headers get omitted.
-            dumpIfSet_dyn dflags f txt (pprFun g)
-            when (not (dopt f dflags)) $
-                dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (pprFun g)
-        -- Runs a required transformation/analysis
-        run = runInfiniteFuelIO (hsc_OptFuel hsc_env)
-        -- Runs an optional transformation/analysis (and should
-        -- thus be subject to optimization fuel)
-        runOptimization = runFuelIO (hsc_OptFuel hsc_env)
+        mbpprTrace x y z | dopt Opt_D_dump_cmmz dflags = pprTrace x y z
+                         | otherwise = z
+        dump = dumpGraph dflags
+
+        dumps flag name
+           = mapM_ (dumpWith dflags flag name)
+
+runUniqSM :: UniqSM a -> IO a
+runUniqSM m = do
+  us <- mkSplitUniqSupply 'u'
+  return (initUs_ us m)
+
+
+dumpGraph :: DynFlags -> DynFlag -> String -> CmmGraph -> IO ()
+dumpGraph dflags flag name g = do
+  when (dopt Opt_DoCmmLinting dflags) $ do_lint g
+  dumpWith dflags flag name g
+ where
+  do_lint g = case cmmLintGraph g of
+                 Just err -> do { fatalErrorMsg dflags err
+                                ; ghcExit dflags 1
+                                }
+                 Nothing  -> return ()
+
+dumpWith :: Outputable a => DynFlags -> DynFlag -> String -> a -> IO ()
+dumpWith dflags flag txt g = do
+         -- ToDo: No easy way of say "dump all the cmmz, *and* split
+         -- them into files."  Also, -ddump-cmmz doesn't play nicely
+         -- with -ddump-to-file, since the headers get omitted.
+   dumpIfSet_dyn dflags flag txt (ppr g)
+   when (not (dopt flag dflags)) $
+      dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (ppr g)
 
 -- This probably belongs in CmmBuildInfoTables?
 -- We're just finishing the job here: once we know what CAFs are defined
 -- in non-static closures, we can build the SRTs.
-toTops :: HscEnv -> Map CLabel CAFSet -> (TopSRT, [[CmmDecl]])
-                 -> [(CAFSet, CmmDecl)] -> IO (TopSRT, [[CmmDecl]])
-toTops hsc_env topCAFEnv (topSRT, tops) gs =
+toTops :: Map CLabel CAFSet -> (TopSRT, [[CmmDecl]])
+       -> [(CAFSet, CmmDecl)] -> IO (TopSRT, [[CmmDecl]])
+toTops topCAFEnv (topSRT, tops) gs =
   do let setSRT (topSRT, rst) g =
            do (topSRT, gs) <- setInfoTableSRT topCAFEnv topSRT g
               return (topSRT, gs : rst)
-     (topSRT, gs') <- runFuelIO (hsc_OptFuel hsc_env) $ foldM setSRT (topSRT, []) gs
+     (topSRT, gs') <- runUniqSM $ foldM setSRT (topSRT, []) gs
      return (topSRT, concat gs' : tops)
diff --git a/compiler/cmm/CmmProcPoint.hs b/compiler/cmm/CmmProcPoint.hs
index f50d850b3a..ebe40d9c9e 100644
--- a/compiler/cmm/CmmProcPoint.hs
+++ b/compiler/cmm/CmmProcPoint.hs
@@ -4,7 +4,7 @@
 module CmmProcPoint
     ( ProcPointSet, Status(..)
     , callProcPoints, minimalProcPointSet
-    , addProcPointProtocols, splitAtProcPoints, procPointAnalysis
+    , splitAtProcPoints, procPointAnalysis
     )
 where
 
@@ -13,22 +13,17 @@ import Prelude hiding (last, unzip, succ, zip)
 import BlockId
 import CLabel
 import Cmm
+import PprCmm ()
 import CmmUtils
-import CmmContFlowOpt
 import CmmInfo
-import CmmLive
-import Constants
 import Data.List (sortBy)
 import Maybes
-import MkGraph
 import Control.Monad
-import OptimizationFuel
 import Outputable
 import Platform
-import UniqSet
 import UniqSupply
 
-import Compiler.Hoopl
+import Hoopl
 
 import qualified Data.Map as Map
 
@@ -103,34 +98,50 @@ instance Outputable Status where
                     (hsep $ punctuate comma $ map ppr $ setElems ps)
   ppr ProcPoint = text "<procpt>"
 
-lattice :: DataflowLattice Status
-lattice = DataflowLattice "direct proc-point reachability" unreached add_to
-    where unreached = ReachedBy setEmpty
-          add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint)
-          add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint) -- because of previous case
-          add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p')) =
-              let union = setUnion p' p
-              in  if setSize union > setSize p then (SomeChange, ReachedBy union)
-                                               else (NoChange, ReachedBy p)
 --------------------------------------------------
+-- Proc point analysis
+
+procPointAnalysis :: ProcPointSet -> CmmGraph -> UniqSM (BlockEnv Status)
+-- Once you know what the proc-points are, figure out
+-- what proc-points each block is reachable from
+procPointAnalysis procPoints g =
+  -- pprTrace "procPointAnalysis" (ppr procPoints) $
+  dataflowAnalFwdBlocks g initProcPoints $ analFwd lattice forward
+  where initProcPoints = [(id, ProcPoint) | id <- setElems procPoints]
+
 -- transfer equations
 
 forward :: FwdTransfer CmmNode Status
-forward = mkFTransfer3 first middle ((mkFactBase lattice . ) . last)
-    where first :: CmmNode C O -> Status -> Status
-          first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id
-          first  _ x = x
+forward = mkFTransfer3 first middle last
+    where
+      first :: CmmNode C O -> Status -> Status
+      first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id
+      first  _ x = x
 
-          middle _ x = x
+      middle _ x = x
 
-          last :: CmmNode O C -> Status -> [(Label, Status)]
-          last (CmmCall {cml_cont = Just k}) _ = [(k, ProcPoint)]
-          last (CmmForeignCall {succ = k})   _ = [(k, ProcPoint)]
-          last l x = map (\id -> (id, x)) (successors l)
+      last :: CmmNode O C -> Status -> FactBase Status
+      last l x = mkFactBase lattice $ map (\id -> (id, x)) (successors l)
 
--- It is worth distinguishing two sets of proc points:
--- those that are induced by calls in the original graph
--- and those that are introduced because they're reachable from multiple proc points.
+lattice :: DataflowLattice Status
+lattice = DataflowLattice "direct proc-point reachability" unreached add_to
+    where unreached = ReachedBy setEmpty
+          add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint)
+          add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint)
+                       -- because of previous case
+          add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p'))
+             | setSize union > setSize p = (SomeChange, ReachedBy union)
+             | otherwise                 = (NoChange, ReachedBy p)
+           where
+             union = setUnion p' p
+
+----------------------------------------------------------------------
+
+-- It is worth distinguishing two sets of proc points: those that are
+-- induced by calls in the original graph and those that are
+-- introduced because they're reachable from multiple proc points.
+--
+-- Extract the set of Continuation BlockIds, see Note [Continuation BlockIds].
 callProcPoints      :: CmmGraph -> ProcPointSet
 callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g
   where add :: CmmBlock -> BlockSet -> BlockSet
@@ -139,21 +150,17 @@ callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g
                       CmmForeignCall {succ=k}     -> setInsert k set
                       _ -> set
 
-minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph -> FuelUniqSM ProcPointSet
+minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph
+                    -> UniqSM ProcPointSet
 -- Given the set of successors of calls (which must be proc-points)
 -- figure out the minimal set of necessary proc-points
-minimalProcPointSet platform callProcPoints g = extendPPSet platform g (postorderDfs g) callProcPoints
-
-procPointAnalysis :: ProcPointSet -> CmmGraph -> FuelUniqSM (BlockEnv Status)
--- Once you know what the proc-points are, figure out
--- what proc-points each block is reachable from
-procPointAnalysis procPoints g =
-  liftM snd $ dataflowPassFwd g initProcPoints $ analFwd lattice forward
-  where initProcPoints = [(id, ProcPoint) | id <- setElems procPoints]
+minimalProcPointSet platform callProcPoints g
+  = extendPPSet platform g (postorderDfs g) callProcPoints
 
-extendPPSet :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> FuelUniqSM ProcPointSet
+extendPPSet :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> UniqSM ProcPointSet
 extendPPSet platform g blocks procPoints =
     do env <- procPointAnalysis procPoints g
+       -- pprTrace "extensPPSet" (ppr env) $ return ()
        let add block pps = let id = entryLabel block
                            in  case mapLookup id env of
                                  Just ProcPoint -> setInsert id pps
@@ -179,183 +186,13 @@ extendPPSet platform g blocks procPoints =
            pps -> extendPPSet g blocks
                     (foldl extendBlockSet procPoints' pps)
 -}
-       case newPoint of Just id ->
-                          if setMember id procPoints' then panic "added old proc pt"
-                          else extendPPSet platform g blocks (setInsert id procPoints')
-                        Nothing -> return procPoints'
-
-
-------------------------------------------------------------------------
---                    Computing Proc-Point Protocols                  --
-------------------------------------------------------------------------
-
-{-
-
-There is one major trick, discovered by Michael Adams, which is that
-we want to choose protocols in a way that enables us to optimize away
-some continuations.  The optimization is very much like branch-chain
-elimination, except that it involves passing results as well as
-control.  The idea is that if a call's continuation k does nothing but
-CopyIn its results and then goto proc point P, the call's continuation
-may be changed to P, *provided* P's protocol is identical to the
-protocol for the CopyIn.  We choose protocols to make this so.
-
-Here's an explanatory example; we begin with the source code (lines
-separate basic blocks):
-
-  ..1..;
-  x, y = g();
-  goto P;
-  -------
-  P: ..2..;
-
-Zipperization converts this code as follows:
-
-  ..1..;
-  call g() returns to k;
-  -------
-  k: CopyIn(x, y);
-     goto P;
-  -------
-  P: ..2..;
-
-What we'd like to do is assign P the same CopyIn protocol as k, so we
-can eliminate k:
-
-  ..1..;
-  call g() returns to P;
-  -------
-  P: CopyIn(x, y); ..2..;
-
-Of course, P may be the target of more than one continuation, and
-different continuations may have different protocols.  Michael Adams
-implemented a voting mechanism, but he thinks a simple greedy
-algorithm would be just as good, so that's what we do.
-
--}
+       case newPoint of
+         Just id ->
+             if setMember id procPoints'
+                then panic "added old proc pt"
+                else extendPPSet platform g blocks (setInsert id procPoints')
+         Nothing -> return procPoints'
 
-data Protocol = Protocol Convention [CmmFormal] Area
-  deriving Eq
-instance Outputable Protocol where
-  ppr (Protocol c fs a) = text "Protocol" <+> ppr c <+> ppr fs <+> ppr a
-
--- | Function 'optimize_calls' chooses protocols only for those proc
--- points that are relevant to the optimization explained above.
--- The others are assigned by 'add_unassigned', which is not yet clever.
-
-addProcPointProtocols :: ProcPointSet -> ProcPointSet -> CmmGraph -> FuelUniqSM CmmGraph
-addProcPointProtocols callPPs procPoints g =
-  do liveness <- cmmLiveness g
-     (protos, g') <- optimize_calls liveness g
-     blocks'' <- add_CopyOuts protos procPoints g'
-     return $ ofBlockMap (g_entry g) blocks''
-    where optimize_calls liveness g =  -- see Note [Separate Adams optimization]
-            do let (protos, blocks') =
-                       foldGraphBlocks maybe_add_call (mapEmpty, mapEmpty) g
-                   protos' = add_unassigned liveness procPoints protos
-               let g' = ofBlockMap (g_entry g) (add_CopyIns callPPs protos' blocks')
-               return (protos', removeUnreachableBlocks g')
-          maybe_add_call :: CmmBlock -> (BlockEnv Protocol, BlockEnv CmmBlock)
-                         -> (BlockEnv Protocol, BlockEnv CmmBlock)
-          -- ^ If the block is a call whose continuation goes to a proc point
-          -- whose protocol either matches the continuation's or is not yet set,
-          -- redirect the call (cf 'newblock') and set the protocol if necessary
-          maybe_add_call block (protos, blocks) =
-              case lastNode block of
-                CmmCall tgt (Just k) args res s
-                    | Just proto <- mapLookup k protos,
-                      Just pee   <- branchesToProcPoint k
-                    -> let newblock = replaceLastNode block (CmmCall tgt (Just pee)
-                                                                     args res s)
-                           changed_blocks   = insertBlock newblock blocks
-                           unchanged_blocks = insertBlock block    blocks
-                       in case mapLookup pee protos of
-                            Nothing -> (mapInsert pee proto protos, changed_blocks)
-                            Just proto' ->
-                              if proto == proto' then (protos, changed_blocks)
-                              else (protos, unchanged_blocks)
-                _ -> (protos, insertBlock block blocks)
-
-          branchesToProcPoint :: BlockId -> Maybe BlockId
-          -- ^ Tells whether the named block is just a branch to a proc point
-          branchesToProcPoint id =
-              let block = mapLookup id (toBlockMap g) `orElse`
-                                    panic "branch out of graph"
-              in case blockToNodeList block of
-                   (_, [], JustC (CmmBranch pee)) | setMember pee procPoints -> Just pee
-                   _                                                         -> Nothing
-
--- | For now, following a suggestion by Ben Lippmeier, we pass all
--- live variables as arguments, hoping that a clever register
--- allocator might help.
-
-add_unassigned :: BlockEnv CmmLive -> ProcPointSet -> BlockEnv Protocol ->
-                  BlockEnv Protocol
-add_unassigned = pass_live_vars_as_args
-
-pass_live_vars_as_args :: BlockEnv CmmLive -> ProcPointSet ->
-                          BlockEnv Protocol -> BlockEnv Protocol
-pass_live_vars_as_args _liveness procPoints protos = protos'
-  where protos' = setFold addLiveVars protos procPoints
-        addLiveVars :: BlockId -> BlockEnv Protocol -> BlockEnv Protocol
-        addLiveVars id protos =
-            case mapLookup id protos of
-              Just _  -> protos
-              Nothing -> let live = emptyRegSet
-                                    --lookupBlockEnv _liveness id `orElse`
-                                    --panic ("no liveness at block " ++ show id)
-                             formals = uniqSetToList live
-                             prot = Protocol Private formals $ CallArea $ Young id
-                         in  mapInsert id prot protos
-
-
--- | Add copy-in instructions to each proc point that did not arise from a call
--- instruction. (Proc-points that arise from calls already have their copy-in instructions.)
-
-add_CopyIns :: ProcPointSet -> BlockEnv Protocol -> BlockEnv CmmBlock -> BlockEnv CmmBlock
-add_CopyIns callPPs protos blocks = mapFold maybe_insert_CopyIns mapEmpty blocks
-    where maybe_insert_CopyIns block blocks
-             | not $ setMember bid callPPs
-             , Just (Protocol c fs _area) <- mapLookup bid protos
-             = let nodes     = copyInSlot c fs
-                   (h, m, l) = blockToNodeList block
-               in insertBlock (blockOfNodeList (h, nodes ++ m, l)) blocks
-             | otherwise = insertBlock block blocks
-           where bid = entryLabel block
-
-
--- | Add a CopyOut node before each procpoint.
--- If the predecessor is a call, then the copy outs should already be done by the callee.
--- Note: If we need to add copy-out instructions, they may require stack space,
--- so we accumulate a map from the successors to the necessary stack space,
--- then update the successors after we have finished inserting the copy-outs.
-
-add_CopyOuts :: BlockEnv Protocol -> ProcPointSet -> CmmGraph ->
-                FuelUniqSM (BlockEnv CmmBlock)
-add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty) g
-    where mb_copy_out :: CmmBlock -> FuelUniqSM (BlockEnv CmmBlock) ->
-                                     FuelUniqSM (BlockEnv CmmBlock)
-          mb_copy_out b z | entryLabel b == g_entry g = skip b z
-          mb_copy_out b z =
-            case lastNode b of
-              CmmCall {}        -> skip b z -- copy out done by callee
-              CmmForeignCall {} -> skip b z -- copy out done by callee
-              _ -> copy_out b z
-          copy_out b z = foldr trySucc init (successors b) >>= finish
-            where init = (\bmap -> (b, bmap)) `liftM` z
-                  trySucc succId z =
-                    if setMember succId procPoints then
-                      case mapLookup succId protos of
-                        Nothing -> z
-                        Just (Protocol c fs _area) -> insert z succId $ copyOutSlot c fs
-                    else z
-                  insert z succId m =
-                    do (b, bmap) <- z
-                       (b, bs)   <- insertBetween b m succId
-                       -- pprTrace "insert for succ" (ppr succId <> ppr m) $ do
-                       return $ (b, foldl (flip insertBlock) bmap bs)
-                  finish (b, bmap) = return $ insertBlock b bmap
-          skip b bs = insertBlock b `liftM` bs
 
 -- At this point, we have found a set of procpoints, each of which should be
 -- the entry point of a procedure.
@@ -370,10 +207,9 @@ add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty)
 -- ToDo: use the _ret naming convention that the old code generator
 -- used. -- EZY
 splitAtProcPoints :: CLabel -> ProcPointSet-> ProcPointSet -> BlockEnv Status ->
-                     CmmDecl -> FuelUniqSM [CmmDecl]
+                     CmmDecl -> UniqSM [CmmDecl]
 splitAtProcPoints entry_label callPPs procPoints procMap
-                  (CmmProc (TopInfo {info_tbl=info_tbl,
-                                     stack_info=stack_info})
+                  (CmmProc (TopInfo {info_tbl=info_tbl})
                            top_l g@(CmmGraph {g_entry=entry})) =
   do -- Build a map from procpoints to the blocks they reach
      let addBlock b graphEnv =
@@ -384,15 +220,18 @@ splitAtProcPoints entry_label callPPs procPoints procMap
                  []   -> graphEnv
                  [id] -> add graphEnv id bid b 
                  _    -> panic "Each block should be reachable from only one ProcPoint"
-             Nothing -> pprPanic "block not reached by a proc point?" (ppr bid)
+             Nothing -> graphEnv
            where bid = entryLabel b
          add graphEnv procId bid b = mapInsert procId graph' graphEnv
                where graph  = mapLookup procId graphEnv `orElse` mapEmpty
                      graph' = mapInsert bid b graph
+
      graphEnv <- return $ foldGraphBlocks addBlock emptyBlockMap g
+
      -- Build a map from proc point BlockId to pairs of:
      --  * Labels for their new procedures
-     --  * Labels for the info tables of their new procedures (only if the proc point is a callPP)
+     --  * Labels for the info tables of their new procedures (only if
+     --    the proc point is a callPP)
      -- Due to common blockification, we may overestimate the set of procpoints.
      let add_label map pp = Map.insert pp lbls map
            where lbls | pp == entry = (entry_label, Just entry_info_lbl)
@@ -401,30 +240,16 @@ splitAtProcPoints entry_label callPPs procPoints procMap
                  entry_info_lbl = cit_lbl info_tbl
          procLabels = foldl add_label Map.empty
                             (filter (flip mapMember (toBlockMap g)) (setElems procPoints))
-     -- For each procpoint, we need to know the SP offset on entry.
-     -- If the procpoint is:
-     --  - continuation of a call, the SP offset is in the call
-     --  - otherwise, 0 (and left out of the spEntryMap)
-     let add_sp_off :: CmmBlock -> BlockEnv CmmStackInfo -> BlockEnv CmmStackInfo
-         add_sp_off b env =
-           case lastNode b of
-             CmmCall {cml_cont = Just succ, cml_ret_args = off, cml_ret_off = updfr_off} ->
-               mapInsert succ (StackInfo { arg_space = off, updfr_space = Just updfr_off}) env
-             CmmForeignCall {succ = succ, updfr = updfr_off} ->
-               mapInsert succ (StackInfo { arg_space = wORD_SIZE, updfr_space = Just updfr_off}) env
-             _ -> env
-         spEntryMap = foldGraphBlocks add_sp_off (mapInsert entry stack_info emptyBlockMap) g
-         getStackInfo id = mapLookup id spEntryMap `orElse` StackInfo {arg_space = 0, updfr_space = Nothing}
      -- In each new graph, add blocks jumping off to the new procedures,
      -- and replace branches to procpoints with branches to the jump-off blocks
      let add_jump_block (env, bs) (pp, l) =
            do bid <- liftM mkBlockId getUniqueM
-              let b = blockOfNodeList (JustC (CmmEntry bid), [], JustC jump)
-                  StackInfo {arg_space = argSpace, updfr_space = off} = getStackInfo pp
-                  jump = CmmCall (CmmLit (CmmLabel l)) Nothing argSpace 0
-                                 (off `orElse` 0) -- Jump's shouldn't need the offset...
+              let b = blockJoin (CmmEntry bid) emptyBlock jump
+                  jump = CmmCall (CmmLit (CmmLabel l)) Nothing [{-XXX-}] 0 0 0
+                  -- XXX: No regs are live at the call
               return (mapInsert pp bid env, b : bs)
-         add_jumps (newGraphEnv) (ppId, blockEnv) =
+
+         add_jumps newGraphEnv (ppId, blockEnv) =
            do let needed_jumps = -- find which procpoints we currently branch to
                     mapFold add_if_branch_to_pp [] blockEnv
                   add_if_branch_to_pp :: CmmBlock -> [(BlockId, CLabel)] -> [(BlockId, CLabel)]
@@ -441,17 +266,16 @@ splitAtProcPoints entry_label callPPs procPoints procMap
                  foldM add_jump_block (mapEmpty, []) needed_jumps
                   -- update the entry block
               let b = expectJust "block in env" $ mapLookup ppId blockEnv
-                  off = getStackInfo ppId
                   blockEnv' = mapInsert ppId b blockEnv
                   -- replace branches to procpoints with branches to jumps
                   blockEnv'' = toBlockMap $ replaceBranches jumpEnv $ ofBlockMap ppId blockEnv'
                   -- add the jump blocks to the graph
                   blockEnv''' = foldl (flip insertBlock) blockEnv'' jumpBlocks
-              let g' = (off, ofBlockMap ppId blockEnv''')
+              let g' = ofBlockMap ppId blockEnv'''
               -- pprTrace "g' pre jumps" (ppr g') $ do
               return (mapInsert ppId g' newGraphEnv)
      graphEnv <- foldM add_jumps emptyBlockMap $ mapToList graphEnv
-     let to_proc (bid, (stack_info, g)) = case expectJust "pp label" $ Map.lookup bid procLabels of
+     let to_proc (bid, g) = case expectJust "pp label" $ Map.lookup bid procLabels of
              (lbl, Just info_lbl)
                | bid == entry
                -> CmmProc (TopInfo {info_tbl=info_tbl, stack_info=stack_info})
@@ -462,15 +286,23 @@ splitAtProcPoints entry_label callPPs procPoints procMap
              (lbl, Nothing)
                -> CmmProc (TopInfo {info_tbl=CmmNonInfoTable, stack_info=stack_info})
                           lbl (replacePPIds g)
-         -- References to procpoint IDs can now be replaced with the infotable's label
-         replacePPIds g = mapGraphNodes (id, mapExp repl, mapExp repl) g
+            where
+             stack_info = StackInfo 0 Nothing -- panic "No StackInfo"
+                          -- cannot use panic, this is printed by -ddump-cmmz
+
+         -- References to procpoint IDs can now be replaced with the
+         -- infotable's label
+         replacePPIds g = {-# SCC "replacePPIds" #-}
+                          mapGraphNodes (id, mapExp repl, mapExp repl) g
            where repl e@(CmmLit (CmmBlock bid)) =
                    case Map.lookup bid procLabels of
                      Just (_, Just info_lbl)  -> CmmLit (CmmLabel info_lbl)
                      _ -> e
                  repl e = e
-     -- The C back end expects to see return continuations before the call sites.
-     -- Here, we sort them in reverse order -- it gets reversed later.
+
+     -- The C back end expects to see return continuations before the
+     -- call sites.  Here, we sort them in reverse order -- it gets
+     -- reversed later.
      let (_, block_order) = foldl add_block_num (0::Int, emptyBlockMap) (postorderDfs g)
          add_block_num (i, map) block = (i+1, mapInsert (entryLabel block) i map)
          sort_fn (bid, _) (bid', _) =
@@ -482,6 +314,27 @@ splitAtProcPoints entry_label callPPs procPoints procMap
             procs
 splitAtProcPoints _ _ _ _ t@(CmmData _ _) = return [t]
 
+
+-- Only called from CmmProcPoint.splitAtProcPoints. NB. does a
+-- recursive lookup, see comment below.
+replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph
+replaceBranches env cmmg
+  = {-# SCC "replaceBranches" #-}
+    ofBlockMap (g_entry cmmg) $ mapMap f $ toBlockMap cmmg
+  where
+    f block = replaceLastNode block $ last (lastNode block)
+
+    last :: CmmNode O C -> CmmNode O C
+    last (CmmBranch id)          = CmmBranch (lookup id)
+    last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi)
+    last (CmmSwitch e tbl)       = CmmSwitch e (map (fmap lookup) tbl)
+    last l@(CmmCall {})          = l
+    last l@(CmmForeignCall {})   = l
+    lookup id = fmap lookup (mapLookup id env) `orElse` id
+            -- XXX: this is a recursive lookup, it follows chains
+            -- until the lookup returns Nothing, at which point we
+            -- return the last BlockId
+
 ----------------------------------------------------------------
 
 {-
diff --git a/compiler/cmm/CmmRewriteAssignments.hs b/compiler/cmm/CmmRewriteAssignments.hs
index ecf3f7e0c3..2f13997771 100644
--- a/compiler/cmm/CmmRewriteAssignments.hs
+++ b/compiler/cmm/CmmRewriteAssignments.hs
@@ -18,23 +18,23 @@ module CmmRewriteAssignments
 import Cmm
 import CmmUtils
 import CmmOpt
-import OptimizationFuel
 import StgCmmUtils
 
-import Control.Monad
+import UniqSupply
 import Platform
 import UniqFM
 import Unique
 import BlockId
 
-import Compiler.Hoopl hiding (Unique)
+import Hoopl
 import Data.Maybe
+import Control.Monad
 import Prelude hiding (succ, zip)
 
 ----------------------------------------------------------------
 --- Main function
 
-rewriteAssignments :: Platform -> CmmGraph -> FuelUniqSM CmmGraph
+rewriteAssignments :: Platform -> CmmGraph -> UniqSM CmmGraph
 rewriteAssignments platform g = do
   -- Because we need to act on forwards and backwards information, we
   -- first perform usage analysis and bake this information into the
@@ -213,7 +213,7 @@ usageTransfer = mkBTransfer3 first middle last
           increaseUsage f r = addToUFM_C combine f r SingleUse
             where combine _ _ = ManyUse
 
-usageRewrite :: BwdRewrite FuelUniqSM (WithRegUsage CmmNode) UsageMap
+usageRewrite :: BwdRewrite UniqSM (WithRegUsage CmmNode) UsageMap
 usageRewrite = mkBRewrite3 first middle last
     where first  _ _ = return Nothing
           middle :: Monad m => WithRegUsage CmmNode O O -> UsageMap -> m (Maybe (Graph (WithRegUsage CmmNode) O O))
@@ -226,7 +226,7 @@ usageRewrite = mkBRewrite3 first middle last
           last   _ _ = return Nothing
 
 type CmmGraphWithRegUsage = GenCmmGraph (WithRegUsage CmmNode)
-annotateUsage :: CmmGraph -> FuelUniqSM (CmmGraphWithRegUsage)
+annotateUsage :: CmmGraph -> UniqSM (CmmGraphWithRegUsage)
 annotateUsage vanilla_g =
     let g = modifyGraph liftRegUsage vanilla_g
     in liftM fst $ dataflowPassBwd g [(g_entry g, fact_bot usageLattice)] $
@@ -404,8 +404,8 @@ clobbers (CmmReg (CmmGlobal Hp), _) (_, _) = False
 -- ToDo: Also catch MachOp case
 clobbers (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _)
     | getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?)
-clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
-    where f (CmmLoad (CmmStackSlot (CallArea a') o') t)
+clobbers (CmmStackSlot a o, rhs) (_, expr) = f expr
+    where f (CmmLoad (CmmStackSlot a' o') t)
             = (a, o, widthInBytes (cmmExprWidth rhs)) `overlaps` (a', o', widthInBytes (typeWidth t))
           f (CmmLoad e _)    = containsStackSlot e
           f (CmmMachOp _ es) = or (map f es)
@@ -416,9 +416,6 @@ clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
           containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es)
           containsStackSlot (CmmStackSlot{}) = True
           containsStackSlot _ = False
-clobbers (CmmStackSlot (RegSlot l) _, _) (_, expr) = f expr
-    where f (CmmLoad (CmmStackSlot (RegSlot l') _) _) = l == l'
-          f _ = False
 clobbers _ (_, e) = f e
     where f (CmmLoad (CmmStackSlot _ _) _) = False
           f (CmmLoad{}) = True -- conservative
@@ -432,7 +429,7 @@ clobbers _ (_, e) = f e
 --      [ I32  ]
 --      [    F64     ]
 --      s'   -w'-    o'
-type CallSubArea = (AreaId, Int, Int) -- area, offset, width
+type CallSubArea = (Area, Int, Int) -- area, offset, width
 overlaps :: CallSubArea -> CallSubArea -> Bool
 overlaps (a, _, _) (a', _, _) | a /= a' = False
 overlaps (_, o, w) (_, o', w') =
@@ -441,7 +438,7 @@ overlaps (_, o, w) (_, o', w') =
     in (s' < o) && (s < o) -- Not LTE, because [ I32  ][ I32  ] is OK
 
 lastAssignment :: WithRegUsage CmmNode O C -> AssignmentMap -> [(Label, AssignmentMap)]
-lastAssignment (Plain (CmmCall _ (Just k) _ _ _)) assign = [(k, invalidateVolatile k assign)]
+lastAssignment (Plain (CmmCall _ (Just k) _ _ _ _)) assign = [(k, invalidateVolatile k assign)]
 lastAssignment (Plain (CmmForeignCall {succ=k}))  assign = [(k, invalidateVolatile k assign)]
 lastAssignment l assign = map (\id -> (id, deleteSinks l assign)) $ successors l
 
@@ -457,7 +454,7 @@ invalidateVolatile :: BlockId -> AssignmentMap -> AssignmentMap
 invalidateVolatile k m = mapUFM p m
   where p (AlwaysInline e) = if exp e then AlwaysInline e else NeverOptimize
             where exp CmmLit{} = True
-                  exp (CmmLoad (CmmStackSlot (CallArea (Young k')) _) _)
+                  exp (CmmLoad (CmmStackSlot (Young k') _) _)
                     | k' == k = False
                   exp (CmmLoad (CmmStackSlot _ _) _) = True
                   exp (CmmMachOp _ es) = and (map exp es)
@@ -527,7 +524,7 @@ assignmentTransfer = mkFTransfer3 (flip const) middleAssignment ((mkFactBase ass
 -- values from the assignment map, due to reassignment of the local
 -- register.)  This is probably not locally sound.
 
-assignmentRewrite :: FwdRewrite FuelUniqSM (WithRegUsage CmmNode) AssignmentMap
+assignmentRewrite :: FwdRewrite UniqSM (WithRegUsage CmmNode) AssignmentMap
 assignmentRewrite = mkFRewrite3 first middle last
     where
         first _ _ = return Nothing
@@ -596,10 +593,6 @@ assignmentRewrite = mkFRewrite3 first middle last
                           where rep = typeWidth (localRegType r)
               _ -> old
         -- See Note [Soundness of store rewriting]
-        inlineExp assign old@(CmmLoad (CmmStackSlot (RegSlot r) _) _)
-          = case lookupUFM assign r of
-              Just (AlwaysInline x) -> x
-              _ -> old
         inlineExp _ old = old
 
         inlinable :: CmmNode e x -> Bool
@@ -612,7 +605,7 @@ assignmentRewrite = mkFRewrite3 first middle last
 -- in literals, which we can inline more aggressively, and inlining
 -- gives us opportunities for more folding.  However, we don't need any
 -- facts to do MachOp folding.
-machOpFoldRewrite :: Platform -> FwdRewrite FuelUniqSM (WithRegUsage CmmNode) a
+machOpFoldRewrite :: Platform -> FwdRewrite UniqSM (WithRegUsage CmmNode) a
 machOpFoldRewrite platform = mkFRewrite3 first middle last
   where first _ _ = return Nothing
         middle :: WithRegUsage CmmNode O O -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O O
diff --git a/compiler/cmm/CmmSpillReload.hs b/compiler/cmm/CmmSpillReload.hs
deleted file mode 100644
index 9e762fe48a..0000000000
--- a/compiler/cmm/CmmSpillReload.hs
+++ /dev/null
@@ -1,166 +0,0 @@
-{-# LANGUAGE GADTs, NoMonoLocalBinds, FlexibleContexts #-}
--- Norman likes local bindings
--- If this module lives on I'd like to get rid of this flag in due course
-
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
-
--- TODO: Get rid of this flag:
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-
-module CmmSpillReload
-  ( dualLivenessWithInsertion
-  )
-where
-
-import BlockId
-import Cmm
-import CmmUtils
-import CmmLive
-import OptimizationFuel
-
-import Control.Monad
-import Outputable hiding (empty)
-import qualified Outputable as PP
-import UniqSet
-
-import Compiler.Hoopl hiding (Unique)
-import Data.Maybe
-import Prelude hiding (succ, zip)
-
-{- Note [Overview of spill/reload]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The point of this module is to insert spills and reloads to establish
-the invariant that at a call or any proc point with an established
-protocol all live variables not expected in registers are sitting on the
-stack.  We use a backward dual liveness analysis (both traditional
-register liveness as well as register slot liveness on the stack) to
-insert spills and reloads.  It should be followed by a forward
-transformation to sink reloads as deeply as possible, so as to reduce
-register pressure: this transformation is performed by
-CmmRewriteAssignments.
-
-A variable can be expected to be live in a register, live on the
-stack, or both.  This analysis ensures that spills and reloads are
-inserted as needed to make sure that every live variable needed
-after a call is available on the stack.  Spills are placed immediately
-after their reaching definitions, but reloads are placed immediately
-after a return from a call (the entry point.)
-
-Note that we offer no guarantees about the consistency of the value
-in memory and the value in the register, except that they are
-equal across calls/procpoints.  If the variable is changed, this
-mapping breaks: but as the original value of the register may still
-be useful in a different context, the memory location is not updated.
--}
-
-data DualLive = DualLive { on_stack :: RegSet, in_regs :: RegSet }
-
-changeStack, changeRegs :: (RegSet -> RegSet) -> DualLive -> DualLive
-changeStack f live = live { on_stack = f (on_stack live) }
-changeRegs  f live = live { in_regs  = f (in_regs  live) }
-
-dualLiveLattice :: DataflowLattice DualLive
-dualLiveLattice = DataflowLattice "variables live in registers and on stack" empty add
-    where empty = DualLive emptyRegSet emptyRegSet
-          add _ (OldFact old) (NewFact new) = (changeIf $ change1 || change2, DualLive stack regs)
-            where (change1, stack) = add1 (on_stack old) (on_stack new)
-                  (change2, regs)  = add1 (in_regs old)  (in_regs new)
-          add1 old new = if sizeUniqSet join > sizeUniqSet old then (True, join) else (False, old)
-            where join = unionUniqSets old new
-
-dualLivenessWithInsertion :: BlockSet -> CmmGraph -> FuelUniqSM CmmGraph
-dualLivenessWithInsertion procPoints g =
-  liftM fst $ dataflowPassBwd g [] $ analRewBwd dualLiveLattice
-                                                (dualLiveTransfers (g_entry g) procPoints)
-                                                (insertSpillsAndReloads g procPoints)
-
--- Note [Live registers on entry to procpoints]
--- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- Remember that the transfer function is only ever run on the rewritten
--- version of a graph, and the rewrite function for spills and reloads
--- enforces the invariant that no local registers are live on entry to
--- a procpoint.  Accordingly, we check for this invariant here.  An old
--- version of this code incorrectly claimed that any live registers were
--- live on the stack before entering the function: this is wrong, but
--- didn't cause bugs because it never actually was invoked.
-
-dualLiveTransfers :: BlockId -> BlockSet -> (BwdTransfer CmmNode DualLive)
-dualLiveTransfers entry procPoints = mkBTransfer3 first middle last
-    where first :: CmmNode C O -> DualLive -> DualLive
-          first (CmmEntry id) live -- See Note [Live registers on entry to procpoints]
-            | id == entry || setMember id procPoints = noLiveOnEntry id (in_regs live) live
-            | otherwise                              = live
-
-          middle :: CmmNode O O -> DualLive -> DualLive
-          middle m = changeStack updSlots
-                   . changeRegs  updRegs
-            where -- Reuse middle of liveness analysis from CmmLive
-                  updRegs = case getBTransfer3 xferLive of (_, middle, _) -> middle m
-
-                  updSlots live = foldSlotsUsed reload (foldSlotsDefd spill live m) m
-                  spill  live s@(RegSlot r, _, _) = check s $ deleteFromRegSet live r
-                  spill  live _ = live
-                  reload live s@(RegSlot r, _, _) = check s $ extendRegSet live r
-                  reload live _ = live
-                  -- Ensure the assignment refers to the entirety of the
-                  -- register slot (and not just a slice).
-                  check (RegSlot (LocalReg _ ty), o, w) x
-                     | o == w && w == widthInBytes (typeWidth ty) = x
-                  check _ _ = panic "dualLiveTransfers: slices unsupported"
-
-          -- Register analysis is identical to liveness analysis from CmmLive.
-          last :: CmmNode O C -> FactBase DualLive -> DualLive
-          last l fb = changeRegs (gen_kill l) $ case l of
-            CmmCall {cml_cont=Nothing} -> empty
-            CmmCall {cml_cont=Just k}  -> keep_stack_only k
-            CmmForeignCall {succ=k}    -> keep_stack_only k
-            _                          -> joinOutFacts dualLiveLattice l fb
-            where empty = fact_bot dualLiveLattice
-                  lkp k = fromMaybe empty (lookupFact k fb)
-                  keep_stack_only k = DualLive (on_stack (lkp k)) emptyRegSet
-
-insertSpillsAndReloads :: CmmGraph -> BlockSet -> CmmBwdRewrite DualLive
-insertSpillsAndReloads graph procPoints = deepBwdRw3 first middle nothing
-  -- Beware: deepBwdRw with one polymorphic function seems more reasonable here,
-  -- but GHC miscompiles it, see bug #4044.
-    where first :: CmmNode C O -> Fact O DualLive -> CmmReplGraph C O
-          first e@(CmmEntry id) live = return $
-            if id /= (g_entry graph) && setMember id procPoints then
-              case map reload (uniqSetToList (in_regs live)) of
-                [] -> Nothing
-                is -> Just $ mkFirst e <*> mkMiddles is
-            else Nothing
-          -- EZY: There was some dead code for handling the case where
-          -- we were not splitting procedures.  Check Git history if
-          -- you're interested (circa e26ea0f41).
-
-          middle :: CmmNode O O -> Fact O DualLive -> CmmReplGraph O O
-          -- Don't add spills next to reloads.
-          middle (CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg') _) _)) _ | reg == reg' = return Nothing
-          -- Spill if register is live on stack.
-          middle m@(CmmAssign (CmmLocal reg) _) live
-            | reg `elemRegSet` on_stack live = return (Just (mkMiddles [m, spill reg]))
-          middle _ _ = return Nothing
-
-          nothing _ _ = return Nothing
-
-spill, reload :: LocalReg -> CmmNode O O
-spill  r = CmmStore  (regSlot r) (CmmReg $ CmmLocal r)
-reload r = CmmAssign (CmmLocal r) (CmmLoad (regSlot r) $ localRegType r)
-
----------------------
--- prettyprinting
-
-ppr_regs :: String -> RegSet -> SDoc
-ppr_regs s regs = text s <+> commafy (map ppr $ uniqSetToList regs)
-  where commafy xs = hsep $ punctuate comma xs
-
-instance Outputable DualLive where
-  ppr (DualLive {in_regs = regs, on_stack = stack}) =
-      if isEmptyUniqSet regs && isEmptyUniqSet stack then
-          text "<nothing-live>"
-      else
-          nest 2 $ fsep [if isEmptyUniqSet regs then PP.empty
-                         else (ppr_regs "live in regs =" regs),
-                         if isEmptyUniqSet stack then PP.empty
-                         else (ppr_regs "live on stack =" stack)]
diff --git a/compiler/cmm/CmmStackLayout.hs b/compiler/cmm/CmmStackLayout.hs
deleted file mode 100644
index c7fedad05b..0000000000
--- a/compiler/cmm/CmmStackLayout.hs
+++ /dev/null
@@ -1,591 +0,0 @@
-{-# LANGUAGE GADTs, NoMonoLocalBinds #-}
--- Norman likes local bindings
--- If this module lives on I'd like to get rid of the NoMonoLocalBinds
--- extension in due course
-
-{-# OPTIONS -fno-warn-tabs #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and
--- detab the module (please do the detabbing in a separate patch). See
---     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
--- for details
-
--- Todo: remove -fno-warn-warnings-deprecations
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
-
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-#if __GLASGOW_HASKELL__ >= 703
--- GHC 7.0.1 improved incomplete pattern warnings with GADTs
-{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
-#endif
-
-module CmmStackLayout
-    ( SlotEnv, liveSlotAnal, liveSlotTransfers, removeLiveSlotDefs
-    , getSpEntryMap, layout, manifestSP, igraph, areaBuilder
-    , stubSlotsOnDeath ) -- to help crash early during debugging
-where
-
-import Constants
-import Prelude hiding (succ, zip, unzip, last)
-
-import BlockId
-import Cmm
-import CmmUtils
-import CmmProcPoint
-import Maybes
-import MkGraph (stackStubExpr)
-import Control.Monad
-import OptimizationFuel
-import Outputable
-import SMRep (ByteOff)
-
-import Compiler.Hoopl
-
-import Data.Map (Map)
-import qualified Data.Map as Map
-import qualified FiniteMap as Map
-
-------------------------------------------------------------------------
---                    Stack Layout                                    --
-------------------------------------------------------------------------
-
--- | Before we lay out the stack, we need to know something about the
--- liveness of the stack slots. In particular, to decide whether we can
--- reuse a stack location to hold multiple stack slots, we need to know
--- when each of the stack slots is used.
--- Although tempted to use something simpler, we really need a full interference
--- graph. Consider the following case:
---   case <...> of
---     1 -> <spill x>; // y is dead out
---     2 -> <spill y>; // x is dead out
---     3 -> <spill x and y>
--- If we consider the arms in order and we use just the deadness information given by a
--- dataflow analysis, we might decide to allocate the stack slots for x and y
--- to the same stack location, which will lead to incorrect code in the third arm.
--- We won't make this mistake with an interference graph.
-
--- First, the liveness analysis.
--- We represent a slot with an area, an offset into the area, and a width.
--- Tracking the live slots is a bit tricky because there may be loads and stores
--- into only a part of a stack slot (e.g. loading the low word of a 2-word long),
--- e.g. Slot A 0 8 overlaps with Slot A 4 4.
---
--- The definition of a slot set is intended to reduce the number of overlap
--- checks we have to make. There's no reason to check for overlap between
--- slots in different areas, so we segregate the map by Area's.
--- We expect few slots in each Area, so we collect them in an unordered list.
--- To keep these lists short, any contiguous live slots are coalesced into
--- a single slot, on insertion.
-
-slotLattice :: DataflowLattice SubAreaSet
-slotLattice = DataflowLattice "live slots" Map.empty add
-  where add _ (OldFact old) (NewFact new) = case Map.foldRightWithKey addArea (False, old) new of
-                                              (change, x) -> (changeIf change, x)
-        addArea a newSlots z = foldr (addSlot a) z newSlots
-        addSlot a slot (changed, map) =
-          let (c, live) = liveGen slot $ Map.findWithDefault [] a map
-          in (c || changed, Map.insert a live map)
-
-slotLatticeJoin :: [SubAreaSet] -> SubAreaSet
-slotLatticeJoin facts = foldr extend (fact_bot slotLattice) facts
-  where extend fact res = snd $ fact_join slotLattice undefined (OldFact fact) (NewFact res)
-
-type SlotEnv   = BlockEnv SubAreaSet
-  -- The sub-areas live on entry to the block
-
-liveSlotAnal :: CmmGraph -> FuelUniqSM SlotEnv
-liveSlotAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd slotLattice liveSlotTransfers
-
--- Add the subarea s to the subareas in the list-set (possibly coalescing it with
--- adjacent subareas), and also return whether s was a new addition.
-liveGen :: SubArea -> [SubArea] -> (Bool, [SubArea])
-liveGen s set = liveGen' s set []
-  where liveGen' s [] z = (True, s : z)
-        liveGen' s@(a, hi, w) (s'@(a', hi', w') : rst) z =
-          if a /= a' || hi < lo' || lo > hi' then    -- no overlap
-            liveGen' s rst (s' : z)
-          else if s' `contains` s then               -- old contains new
-            (False, set)
-          else                                       -- overlap: coalesce the slots
-            let new_hi = max hi hi'
-                new_lo = min lo lo'
-            in liveGen' (a, new_hi, new_hi - new_lo) rst z
-          where lo  = hi  - w  -- remember: areas grow down
-                lo' = hi' - w'
-        contains (a, hi, w) (a', hi', w') =
-          a == a' && hi >= hi' && hi - w <= hi' - w'
-
-liveKill :: SubArea -> [SubArea] -> [SubArea]
-liveKill (a, hi, w) set = -- pprTrace "killing slots in area" (ppr a) $
-                          liveKill' set []
-  where liveKill' [] z = z
-        liveKill' (s'@(a', hi', w') : rst) z =
-          if a /= a' || hi < lo' || lo > hi' then    -- no overlap
-            liveKill' rst (s' : z)
-          else                                       -- overlap: split the old slot
-            let z'  = if hi' > hi  then (a, hi', hi' - hi)  : z else z
-                z'' = if lo  > lo' then (a, lo,  lo  - lo') : z' else z'
-            in liveKill' rst z''
-          where lo  = hi  - w  -- remember: areas grow down
-                lo' = hi' - w'
-
--- Note: the stack slots that hold variables returned on the stack are not
--- considered live in to the block -- we treat the first node as a definition site.
--- BEWARE?: Am I being a little careless here in failing to check for the
--- entry Id (which would use the CallArea Old).
-liveSlotTransfers :: BwdTransfer CmmNode SubAreaSet
-liveSlotTransfers = mkBTransfer3 frt mid lst
-  where frt :: CmmNode C O -> SubAreaSet -> SubAreaSet
-        frt (CmmEntry l) f = Map.delete (CallArea (Young l)) f
-
-        mid :: CmmNode O O -> SubAreaSet -> SubAreaSet
-        mid n f = foldSlotsUsed addSlot (removeLiveSlotDefs f n) n
-        lst :: CmmNode O C -> FactBase SubAreaSet -> SubAreaSet
-        lst n f = liveInSlots n $ case n of
-          CmmCall {cml_cont=Nothing, cml_args=args} -> add_area (CallArea Old) args out
-          CmmCall {cml_cont=Just k, cml_args=args}  -> add_area (CallArea Old) args (add_area (CallArea (Young k)) args out)
-          CmmForeignCall {succ=k, updfr=oldend}     -> add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
-          _                                         -> out
-         where out = joinOutFacts slotLattice n f
-               add_area _ n live | n == 0 = live
-               add_area a n live = Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
-
--- Slot sets: adding slots, removing slots, and checking for membership.
-liftToArea :: Area -> ([SubArea] -> [SubArea]) -> SubAreaSet -> SubAreaSet 
-addSlot, removeSlot :: SubAreaSet -> SubArea -> SubAreaSet
-elemSlot            :: SubAreaSet -> SubArea -> Bool
-liftToArea a f map = Map.insert a (f (Map.findWithDefault [] a map)) map
-addSlot    live (a, i, w) = liftToArea a (snd . liveGen  (a, i, w)) live
-removeSlot live (a, i, w) = liftToArea a       (liveKill (a, i, w)) live
-elemSlot   live (a, i, w) =
-  not $ fst $ liveGen  (a, i, w) (Map.findWithDefault [] a live)
-
-removeLiveSlotDefs :: (DefinerOfSlots s, UserOfSlots s) => SubAreaSet -> s -> SubAreaSet
-removeLiveSlotDefs = foldSlotsDefd removeSlot
-
-liveInSlots :: (DefinerOfSlots s, UserOfSlots s) => s -> SubAreaSet -> SubAreaSet
-liveInSlots x live = foldSlotsUsed addSlot (removeLiveSlotDefs live x) x
-
-liveLastIn :: CmmNode O C -> (BlockId -> SubAreaSet) -> SubAreaSet
-liveLastIn l env = liveInSlots l (liveLastOut env l)
-
--- Don't forget to keep the outgoing parameters in the CallArea live,
--- as well as the update frame.
--- Note: We have to keep the update frame live at a call because of the
--- case where the function doesn't return -- in that case, there won't
--- be a return to keep the update frame live. We'd still better keep the
--- info pointer in the update frame live at any call site;
--- otherwise we could screw up the garbage collector.
-liveLastOut :: (BlockId -> SubAreaSet) -> CmmNode O C -> SubAreaSet
-liveLastOut env l =
-  case l of
-    CmmCall _ Nothing n _ _ -> 
-      add_area (CallArea Old) n out -- add outgoing args (includes upd frame)
-    CmmCall _ (Just k) n _ _ ->
-      add_area (CallArea Old) n (add_area (CallArea (Young k)) n out)
-    CmmForeignCall { succ = k, updfr = oldend } ->
-      add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
-    _ -> out
-  where out = slotLatticeJoin $ map env $ successors l
-        add_area _ n live | n == 0 = live
-        add_area a n live =
-          Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
-
--- The liveness analysis must be precise: otherwise, we won't know if a definition
--- should really kill a live-out stack slot.
--- But the interference graph does not have to be precise -- it might decide that
--- any live areas interfere. To maintain both a precise analysis and an imprecise
--- interference graph, we need to convert the live-out stack slots to graph nodes
--- at each and every instruction; rather than reconstruct a new list of nodes
--- every time, I provide a function to fold over the nodes, which should be a
--- reasonably efficient approach for the implementations we envision.
--- Of course, it will probably be much easier to program if we just return a list...
-type Set x = Map x ()
-data IGraphBuilder n =
-  Builder { foldNodes     :: forall z. SubArea -> (n -> z -> z) -> z -> z
-          , _wordsOccupied :: AreaSizeMap -> AreaMap -> n -> [Int]
-          }
-
-areaBuilder :: IGraphBuilder Area
-areaBuilder = Builder fold words
-  where fold (a, _, _) f z = f a z
-        words areaSize areaMap a =
-          case Map.lookup a areaMap of
-            Just addr -> [addr .. addr + (Map.lookup a areaSize `orElse`
-                                          pprPanic "wordsOccupied: unknown area" (ppr areaSize <+> ppr a))]
-            Nothing   -> []
-
---slotBuilder :: IGraphBuilder (Area, Int)
---slotBuilder = undefined
-
--- Now, we can build the interference graph.
--- The usual story: a definition interferes with all live outs and all other
--- definitions.
-type IGraph x = Map x (Set x)
-type IGPair x = (IGraph x, IGraphBuilder x)
-igraph :: (Ord x) => IGraphBuilder x -> SlotEnv -> CmmGraph -> IGraph x
-igraph builder env g = foldr interfere Map.empty (postorderDfs g)
-  where foldN = foldNodes builder
-        interfere block igraph = foldBlockNodesB3 (first, middle, last) block igraph
-          where first _ (igraph, _) = igraph
-                middle node (igraph, liveOut) =
-                  (addEdges igraph node liveOut, liveInSlots node liveOut)
-                last node igraph =
-                  (addEdges igraph node $ liveLastOut env' node, liveLastIn node env')
-
-                -- add edges between a def and the other defs and liveouts
-                addEdges igraph i out = fst $ foldSlotsDefd addDef (igraph, out) i
-                addDef (igraph, out) def@(a, _, _) =
-                  (foldN def (addDefN out) igraph,
-                   Map.insert a (snd $ liveGen def (Map.findWithDefault [] a out)) out)
-                addDefN out n igraph =
-                  let addEdgeNO o igraph = foldN o addEdgeNN igraph
-                      addEdgeNN n' igraph = addEdgeNN' n n' $ addEdgeNN' n' n igraph
-                      addEdgeNN' n n' igraph = Map.insert n (Map.insert n' () set) igraph
-                        where set = Map.findWithDefault Map.empty n igraph
-                  in Map.foldRightWithKey (\ _ os igraph -> foldr addEdgeNO igraph os) igraph out
-                env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
-
--- Before allocating stack slots, we need to collect one more piece of information:
--- what's the highest offset (in bytes) used in each Area?
--- We'll need to allocate that much space for each Area.
-
--- Mapping of areas to area sizes (not offsets!)
-type AreaSizeMap = AreaMap
-
--- JD: WHY CAN'T THIS COME FROM THE slot-liveness info?
-getAreaSize :: ByteOff -> CmmGraph -> AreaSizeMap
-  -- The domain of the returned mapping consists only of Areas
-  -- used for (a) variable spill slots, and (b) parameter passing areas for calls
-getAreaSize entry_off g =
-  foldGraphBlocks (foldBlockNodesF3 (first, add_regslots, last))
-              (Map.singleton (CallArea Old) entry_off) g
-  where first _  z = z
-        last :: CmmNode O C -> Map Area Int -> Map Area Int
-        last l@(CmmCall _ Nothing args res _) z  =  add_regslots l (add (add z area args) area res)
-          where area = CallArea Old
-        last l@(CmmCall _ (Just k) args res _) z =  add_regslots l (add (add z area args) area res)
-          where area = CallArea (Young k)
-        last l@(CmmForeignCall {succ = k}) z     =  add_regslots l (add z area wORD_SIZE)
-          where area = CallArea (Young k)
-        last l z                                 =  add_regslots l z
-        add_regslots i z = foldSlotsUsed addSlot (foldSlotsDefd addSlot z i) i
-        addSlot z (a@(RegSlot (LocalReg _ ty)), _, _) =
-          add z a $ widthInBytes $ typeWidth ty
-        addSlot z _ = z
-        add z a off = Map.insert a (max off (Map.findWithDefault 0 a z)) z
-	-- The 'max' is important.  Two calls, to f and g, might share a common
-	-- continuation (and hence a common CallArea), but their number of overflow
-	-- parameters might differ.
-        -- EZY: Ought to use insert with combining function...
-
-
--- Find the Stack slots occupied by the subarea's conflicts
-conflictSlots :: Ord x => IGPair x -> AreaSizeMap -> AreaMap -> SubArea -> Set Int
-conflictSlots (ig, Builder foldNodes wordsOccupied) areaSize areaMap subarea =
-  foldNodes subarea foldNode Map.empty
-  where foldNode n set = Map.foldRightWithKey conflict set $ Map.findWithDefault Map.empty n ig
-        conflict n' () set = liveInSlots areaMap n' set
-        -- Add stack slots occupied by igraph node n
-        liveInSlots areaMap n set = foldr setAdd set (wordsOccupied areaSize areaMap n)
-        setAdd w s = Map.insert w () s
-
--- Find any open space for 'area' on the stack, starting from the
--- 'offset'.  If the area is a CallArea or a spill slot for a pointer,
--- then it must be word-aligned.
-freeSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> Int
-freeSlotFrom ig areaSize offset areaMap area =
-  let size = Map.lookup area areaSize `orElse` 0
-      conflicts = conflictSlots ig areaSize areaMap (area, size, size)
-      -- CallAreas and Ptrs need to be word-aligned (round up!)
-      align = case area of CallArea _                                -> align'
-                           RegSlot  r | isGcPtrType (localRegType r) -> align'
-                           RegSlot  _                                -> id
-      align' n = (n + (wORD_SIZE - 1)) `div` wORD_SIZE * wORD_SIZE
-      -- Find a space big enough to hold the area
-      findSpace curr 0 = curr
-      findSpace curr cnt = -- part of target slot, # of bytes left to check
-        if Map.member curr conflicts then
-          findSpace (align (curr + size)) size -- try the next (possibly) open space
-        else findSpace (curr - 1) (cnt - 1)
-  in findSpace (align (offset + size)) size
-
--- Find an open space on the stack, and assign it to the area.
-allocSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> AreaMap
-allocSlotFrom ig areaSize from areaMap area =
-  if Map.member area areaMap then areaMap
-  else Map.insert area (freeSlotFrom ig areaSize from areaMap area) areaMap
-
--- Figure out all of the offsets from the slot location; this will be
--- non-zero for procpoints.
-type SpEntryMap = BlockEnv Int
-getSpEntryMap :: Int -> CmmGraph -> SpEntryMap
-getSpEntryMap entry_off g@(CmmGraph {g_entry = entry})
-    = foldGraphBlocks add_sp_off (mapInsert entry entry_off emptyBlockMap) g
-  where add_sp_off :: CmmBlock -> BlockEnv Int -> BlockEnv Int
-        add_sp_off b env =
-          case lastNode b of
-            CmmCall {cml_cont=Just succ, cml_ret_args=off} -> mapInsert succ off env
-            CmmForeignCall {succ=succ}                     -> mapInsert succ wORD_SIZE env
-            _                                              -> env
-
--- | Greedy stack layout.
--- Compute liveness, build the interference graph, and allocate slots for the areas.
--- We visit each basic block in a (generally) forward order.
-
--- At each instruction that names a register subarea r, we immediately allocate
--- any available slot on the stack by the following procedure:
---  1. Find the sub-areas S that conflict with r
---  2. Find the stack slots used for S
---  3. Choose a contiguous stack space s not in S (s must be large enough to hold r)
-
--- For a CallArea, we allocate the stack space only when we reach a function
--- call that returns to the CallArea's blockId.
--- Then, we allocate the Area subject to the following constraints:
---   a) It must be younger than all the sub-areas that are live on entry to the block
---         This constraint is only necessary for the successor of a call
---   b) It must not overlap with any already-allocated Area with which it conflicts
---   	   (ie at some point, not necessarily now, is live at the same time)
---   Part (b) is just the 1,2,3 part above
-
--- Note: The stack pointer only has to be younger than the youngest live stack slot
--- at proc points. Otherwise, the stack pointer can point anywhere.
-
-layout :: ProcPointSet -> SpEntryMap -> SlotEnv -> ByteOff -> CmmGraph -> AreaMap
--- The domain of the returned map includes an Area for EVERY block
--- including each block that is not the successor of a call (ie is not a proc-point)
--- That's how we return the info of what the SP should be at the entry of every non
--- procpoint block.  However, note that procpoint blocks have their
--- /slot/ stored, which is not necessarily the value of the SP on entry
--- to the block (in fact, it probably isn't, due to argument passing).
--- See [Procpoint Sp offset]
-
-layout procPoints spEntryMap env entry_off g =
-  let ig = (igraph areaBuilder env g, areaBuilder)
-      env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
-      areaSize = getAreaSize entry_off g
-
-      -- Find the youngest live stack slot that has already been allocated
-      youngest_live :: AreaMap 	   -- Already allocated
-                    -> SubAreaSet  -- Sub-areas live here
-		    -> ByteOff     -- Offset of the youngest byte of any 
-		       		   --    already-allocated, live sub-area
-      youngest_live areaMap live = fold_subareas young_slot live 0
-        where young_slot (a, o, _) z = case Map.lookup a areaMap of
-                                         Just top -> max z $ top + o
-                                         Nothing  -> z
-              fold_subareas f m z = Map.foldRightWithKey (\_ s z -> foldr f z s) z m
-
-      -- Allocate space for spill slots and call areas
-      allocVarSlot = allocSlotFrom ig areaSize 0
-
-      -- Update the successor's incoming SP.
-      setSuccSPs inSp bid areaMap =
-        case (Map.lookup area areaMap , mapLookup bid (toBlockMap g)) of
-          (Just _, _) -> areaMap -- succ already knows incoming SP
-          (Nothing, Just _) ->
-            if setMember bid procPoints then
-              let young = youngest_live areaMap $ env' bid
-                  -- start = case returnOff stackInfo of Just b  -> max b young
-                  --                                     Nothing -> young
-                  start = young -- maybe wrong, but I don't understand
-                                -- why the preceding is necessary...
-              in  allocSlotFrom ig areaSize start areaMap area
-            else Map.insert area inSp areaMap
-          (_, Nothing) -> panic "Block not found in cfg"
-        where area = CallArea (Young bid)
-
-      layoutAreas areaMap block = foldBlockNodesF3 (flip const, allocMid, allocLast (entryLabel block)) block areaMap
-      allocMid m areaMap = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap m) m
-      allocLast bid l areaMap =
-        foldr (setSuccSPs inSp) areaMap' (successors l)
-        where inSp = slot + spOffset -- [Procpoint Sp offset]
-              -- If it's not in the map, we should use our previous
-              -- calculation unchanged.
-              spOffset = mapLookup bid spEntryMap `orElse` 0
-              slot = expectJust "slot in" $ Map.lookup (CallArea (Young bid)) areaMap
-              areaMap' = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap l) l
-      alloc' areaMap (a@(RegSlot _), _, _) = allocVarSlot areaMap a
-      alloc' areaMap _ = areaMap
-
-      initMap = Map.insert (CallArea (Young (g_entry g))) 0
-              . Map.insert (CallArea Old)                 0
-              $ Map.empty
-
-      areaMap = foldl layoutAreas initMap (postorderDfs g)
-  in -- pprTrace "ProcPoints" (ppr procPoints) $
-     -- pprTrace "Area SizeMap" (ppr areaSize) $
-     -- pprTrace "Entry offset" (ppr entry_off) $
-     -- pprTrace "Area Map" (ppr areaMap) $
-     areaMap
-
-{- Note [Procpoint Sp offset]
-
-The calculation of inSp is a little tricky.  (Un)fortunately, if you get
-it wrong, you will get inefficient but correct code.  You know you've
-got it wrong if the generated stack pointer bounces up and down for no
-good reason.
-
-Why can't we just set inSp to the location of the slot?  (This is what
-the code used to do.)  The trouble is when we actually hit the proc
-point the start of the slot will not be the same as the actual Sp due
-to argument passing:
-
-  a:
-      I32[(young<b> + 4)] = cde;
-      // Stack pointer is moved to young end (bottom) of young<b> for call
-      // +-------+
-      // | arg 1 |
-      // +-------+ <- Sp
-      call (I32[foobar::I32])(...) returns to Just b (4) (4) with update frame 4;
-  b:
-      // After call, stack pointer is above the old end (top) of
-      // young<b> (the difference is spOffset)
-      // +-------+ <- Sp
-      // | arg 1 |
-      // +-------+
-
-If we blithely set the Sp to be the same as the slot (the young end of
-young<b>), an adjustment will be necessary when we go to the next block.
-This is wasteful.  So, instead, for the next block after a procpoint,
-the actual Sp should be set to the same as the true Sp when we just
-entered the procpoint.  Then manifestSP will automatically do the right
-thing.
-
-Questions you may ask:
-
-1. Why don't we need to change the mapping for the procpoint itself?
-   Because manifestSP does its own calculation of the true stack value,
-   manifestSP will notice the discrepancy between the actual stack
-   pointer and the slot start, and adjust all of its memory accesses
-   accordingly.  So the only problem is when we adjust the Sp in
-   preparation for the successor block; that's why this code is here and
-   not in setSuccSPs.
-
-2. Why don't we make the procpoint call area and the true offset match
-   up?  If we did that, we would never use memory above the true value
-   of the stack pointer, thus wasting all of the stack we used to store
-   arguments.  You might think that some clever changes to the slot
-   offsets, using negative offsets, might fix it, but this does not make
-   semantic sense.
-
-3. If manifestSP is already calculating the true stack value, why we can't
-   do this trick inside manifestSP itself?  The reason is that if two
-   branches join with inconsistent SPs, one of them has to be fixed: we
-   can't know what the fix should be without already knowing what the
-   chosen location of SP is on the next successor.  (This is
-   the "succ already knows incoming SP" case), This calculation cannot
-   be easily done in manifestSP, since it processes the nodes
-   /backwards/.  So we need to have figured this out before we hit
-   manifestSP.
--}
-
--- After determining the stack layout, we can:
--- 1. Replace references to stack Areas with addresses relative to the stack
---    pointer.
--- 2. Insert adjustments to the stack pointer to ensure that it is at a
---    conventional location at each proc point.
---    Because we don't take interrupts on the execution stack, we only need the
---    stack pointer to be younger than the live values on the stack at proc points.
--- 3. Compute the maximum stack offset used in the procedure and replace
---    the stack high-water mark with that offset.
-manifestSP :: SpEntryMap -> AreaMap -> ByteOff -> CmmGraph -> FuelUniqSM CmmGraph
-manifestSP spEntryMap areaMap entry_off g@(CmmGraph {g_entry=entry}) =
-  ofBlockMap entry `liftM` foldl replB (return mapEmpty) (postorderDfs g)
-  where slot a = -- pprTrace "slot" (ppr a) $
-                   Map.lookup a areaMap `orElse` panic "unallocated Area"
-        slot' (Just id) = slot $ CallArea (Young id)
-        slot' Nothing   = slot $ CallArea Old
-        sp_high = maxSlot slot g
-        proc_entry_sp = slot (CallArea Old) + entry_off
-
-        spOffset id = mapLookup id spEntryMap `orElse` 0
-
-        sp_on_entry id | id == entry = proc_entry_sp
-        sp_on_entry id = slot' (Just id) + spOffset id
-
-        -- On entry to procpoints, the stack pointer is conventional;
-        -- otherwise, we check the SP set by predecessors.
-        replB :: FuelUniqSM (BlockEnv CmmBlock) -> CmmBlock -> FuelUniqSM (BlockEnv CmmBlock)
-        replB blocks block =
-          do let (head, middles, JustC tail :: MaybeC C (CmmNode O C)) = blockToNodeList block
-                 middles' = map (middle spIn) middles
-             bs <- replLast head middles' tail
-             flip (foldr insertBlock) bs `liftM` blocks
-          where spIn = sp_on_entry (entryLabel block)
-
-                middle spOff m = mapExpDeep (replSlot spOff) m
-                -- XXX there shouldn't be any global registers in the
-                -- CmmCall, so there shouldn't be any slots in
-                -- CmmCall... check that...
-                last   spOff l = mapExpDeep (replSlot spOff) l
-                replSlot spOff (CmmStackSlot a i) = CmmRegOff (CmmGlobal Sp) (spOff - (slot a + i))
-                replSlot _ (CmmLit CmmHighStackMark) = -- replacing the high water mark
-                  CmmLit (CmmInt (toInteger (max 0 (sp_high - proc_entry_sp))) (typeWidth bWord))
-                -- Invariant: Sp is always greater than SpLim.  Thus, if
-                -- the high water mark is zero, we can optimize away the
-                -- conditional branch.  Relies on dead code elimination
-                -- to get rid of the dead GC blocks.
-                -- EZY: Maybe turn this into a guard that checks if a
-                -- statement is stack-check ish?  Maybe we should make
-                -- an actual mach-op for it, so there's no chance of
-                -- mixing this up with something else...
-                replSlot _ (CmmMachOp (MO_U_Lt _)
-                              [CmmMachOp (MO_Sub _)
-                                         [ CmmReg (CmmGlobal Sp)
-                                         , CmmLit (CmmInt 0 _)],
-                               CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
-                replSlot _ e = e
-
-                replLast :: MaybeC C (CmmNode C O) -> [CmmNode O O] -> CmmNode O C -> FuelUniqSM [CmmBlock]
-                replLast h m l@(CmmCall _ k n _ _)       = updSp (slot' k + n) h m l
-                -- JD: LastForeignCall probably ought to have an outgoing
-                --     arg size, just like LastCall
-                replLast h m l@(CmmForeignCall {succ=k}) = updSp (slot' (Just k) + wORD_SIZE) h m l
-                replLast h m l@(CmmBranch k)             = updSp (sp_on_entry k) h m l
-                replLast h m l                           = uncurry (:) `liftM` foldr succ (return (b, [])) (successors l)
-                  where b :: CmmBlock
-                        b = updSp' spIn h m l
-                        succ succId z =
-                          let succSp = sp_on_entry succId in
-                          if succSp /= spIn then
-                            do (b,  bs)  <- z
-                               (b', bs') <- insertBetween b (adjustSp succSp) succId
-                               return (b', bs' ++ bs)
-                          else z
-
-                updSp sp h m l = return [updSp' sp h m l]
-                updSp' sp h m l | sp == spIn = blockOfNodeList (h, m, JustC $ last sp l)
-                                | otherwise  = blockOfNodeList (h, m ++ adjustSp sp, JustC $ last sp l)
-                adjustSp sp = [CmmAssign (CmmGlobal Sp) e]
-                  where e = CmmMachOp (MO_Add wordWidth) [CmmReg (CmmGlobal Sp), off]
-                        off = CmmLit $ CmmInt (toInteger $ spIn - sp) wordWidth
-
-
--- To compute the stack high-water mark, we fold over the graph and
--- compute the highest slot offset.
-maxSlot :: (Area -> Int) -> CmmGraph -> Int
-maxSlot slotOff g = foldGraphBlocks (foldBlockNodesF3 (flip const, highSlot, highSlot)) 0 g
-  where highSlot i z = foldSlotsUsed add (foldSlotsDefd add z i) i
-        add z (a, i, _) = max z (slotOff a + i)
-
------------------------------------------------------------------------------
--- | Sanity check: stub pointers immediately after they die
------------------------------------------------------------------------------
--- This will miss stack slots that are last used in a Last node,
--- but it should do pretty well...
-
-stubSlotsOnDeath :: CmmGraph -> FuelUniqSM CmmGraph
-stubSlotsOnDeath g = liftM fst $ dataflowPassBwd g [] $ analRewBwd slotLattice
-                                                                   liveSlotTransfers
-                                                                   rewrites
-    where rewrites = mkBRewrite3 frt mid lst
-          frt _ _ = return Nothing
-          mid m liveSlots = return $ foldSlotsUsed (stub liveSlots m) Nothing m
-          lst _ _ = return Nothing
-          stub liveSlots m rst subarea@(a, off, w) =
-            if elemSlot liveSlots subarea then rst
-            else let store = mkMiddle $ CmmStore (CmmStackSlot a off)
-                                                 (stackStubExpr (widthFromBytes w))
-                 in case rst of Nothing -> Just (mkMiddle m <*> store)
-                                Just g  -> Just (g <*> store)
diff --git a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs
index 0756c87583..f2e4d8e183 100644
--- a/compiler/cmm/CmmUtils.hs
+++ b/compiler/cmm/CmmUtils.hs
@@ -60,13 +60,14 @@ module CmmUtils(
         -- * Operations that probably don't belong here
         modifyGraph,
 
-        lastNode, replaceLastNode, insertBetween,
+        lastNode, replaceLastNode,
         ofBlockMap, toBlockMap, insertBlock,
         ofBlockList, toBlockList, bodyToBlockList,
         foldGraphBlocks, mapGraphNodes, postorderDfs, mapGraphNodes1,
       
         analFwd, analBwd, analRewFwd, analRewBwd,
-        dataflowPassFwd, dataflowPassBwd
+        dataflowPassFwd, dataflowPassBwd, dataflowAnalFwd, dataflowAnalBwd,
+        dataflowAnalFwdBlocks
   ) where
 
 #include "HsVersions.h"
@@ -79,7 +80,6 @@ import Cmm
 import BlockId
 import CLabel
 import Outputable
-import OptimizationFuel as F
 import Unique
 import UniqSupply
 import Constants( wORD_SIZE, tAG_MASK )
@@ -88,8 +88,7 @@ import Util
 import Data.Word
 import Data.Maybe
 import Data.Bits
-import Control.Monad
-import Compiler.Hoopl hiding ( Unique )
+import Hoopl
 
 ---------------------------------------------------
 --
@@ -402,13 +401,13 @@ mkLiveness (reg:regs)
 modifyGraph :: (Graph n C C -> Graph n' C C) -> GenCmmGraph n -> GenCmmGraph n'
 modifyGraph f g = CmmGraph {g_entry=g_entry g, g_graph=f (g_graph g)}
 
-toBlockMap :: CmmGraph -> LabelMap CmmBlock
+toBlockMap :: CmmGraph -> BlockEnv CmmBlock
 toBlockMap (CmmGraph {g_graph=GMany NothingO body NothingO}) = body
 
-ofBlockMap :: BlockId -> LabelMap CmmBlock -> CmmGraph
+ofBlockMap :: BlockId -> BlockEnv CmmBlock -> CmmGraph
 ofBlockMap entry bodyMap = CmmGraph {g_entry=entry, g_graph=GMany NothingO bodyMap NothingO}
 
-insertBlock :: CmmBlock -> LabelMap CmmBlock -> LabelMap CmmBlock
+insertBlock :: CmmBlock -> BlockEnv CmmBlock -> BlockEnv CmmBlock
 insertBlock block map =
   ASSERT (isNothing $ mapLookup id map)
   mapInsert id block map
@@ -418,7 +417,8 @@ toBlockList :: CmmGraph -> [CmmBlock]
 toBlockList g = mapElems $ toBlockMap g
 
 ofBlockList :: BlockId -> [CmmBlock] -> CmmGraph
-ofBlockList entry blocks = CmmGraph {g_entry=entry, g_graph=GMany NothingO body NothingO}
+ofBlockList entry blocks = CmmGraph { g_entry = entry
+                                    , g_graph = GMany NothingO body NothingO }
   where body = foldr addBlock emptyBody blocks
 
 bodyToBlockList :: Body CmmNode -> [CmmBlock]
@@ -429,97 +429,77 @@ mapGraphNodes :: ( CmmNode C O -> CmmNode C O
                  , CmmNode O C -> CmmNode O C)
               -> CmmGraph -> CmmGraph
 mapGraphNodes funs@(mf,_,_) g =
-  ofBlockMap (entryLabel $ mf $ CmmEntry $ g_entry g) $ mapMap (blockMapNodes3 funs) $ toBlockMap g
+  ofBlockMap (entryLabel $ mf $ CmmEntry $ g_entry g) $ mapMap (mapBlock3' funs) $ toBlockMap g
 
 mapGraphNodes1 :: (forall e x. CmmNode e x -> CmmNode e x) -> CmmGraph -> CmmGraph
-mapGraphNodes1 f g = modifyGraph (graphMapBlocks (blockMapNodes f)) g
+mapGraphNodes1 f = modifyGraph (mapGraph f)
 
 
 foldGraphBlocks :: (CmmBlock -> a -> a) -> a -> CmmGraph -> a
 foldGraphBlocks k z g = mapFold k z $ toBlockMap g
 
 postorderDfs :: CmmGraph -> [CmmBlock]
-postorderDfs g = postorder_dfs_from (toBlockMap g) (g_entry g)
-
--------------------------------------------------
--- Manipulating CmmBlocks
-
-lastNode :: CmmBlock -> CmmNode O C
-lastNode block = foldBlockNodesF3 (nothing, nothing, const) block ()
-  where nothing :: a -> b -> ()
-        nothing _ _ = ()
-
-replaceLastNode :: Block CmmNode e C -> CmmNode O C -> Block CmmNode e C
-replaceLastNode block last = blockOfNodeList (first, middle, JustC last)
-  where (first, middle, _) = blockToNodeList block
-
-----------------------------------------------------------------------
------ Splicing between blocks
--- Given a middle node, a block, and a successor BlockId,
--- we can insert the middle node between the block and the successor.
--- We return the updated block and a list of new blocks that must be added
--- to the graph.
--- The semantics is a bit tricky. We consider cases on the last node:
--- o For a branch, we can just insert before the branch,
---   but sometimes the optimizer does better if we actually insert
---   a fresh basic block, enabling some common blockification.
--- o For a conditional branch, switch statement, or call, we must insert
---   a new basic block.
--- o For a jump or return, this operation is impossible.
-
-insertBetween :: MonadUnique m => CmmBlock -> [CmmNode O O] -> BlockId -> m (CmmBlock, [CmmBlock])
-insertBetween b ms succId = insert $ lastNode b
-  where insert :: MonadUnique m => CmmNode O C -> m (CmmBlock, [CmmBlock])
-        insert (CmmBranch bid) =
-          if bid == succId then
-            do (bid', bs) <- newBlocks
-               return (replaceLastNode b (CmmBranch bid'), bs)
-          else panic "tried invalid block insertBetween"
-        insert (CmmCondBranch c t f) =
-          do (t', tbs) <- if t == succId then newBlocks else return $ (t, [])
-             (f', fbs) <- if f == succId then newBlocks else return $ (f, [])
-             return (replaceLastNode b (CmmCondBranch c t' f'), tbs ++ fbs)
-        insert (CmmSwitch e ks) =
-          do (ids, bs) <- mapAndUnzipM mbNewBlocks ks
-             return (replaceLastNode b (CmmSwitch e ids), join bs)
-        insert (CmmCall {}) =
-          panic "unimp: insertBetween after a call -- probably not a good idea"
-        insert (CmmForeignCall {}) =
-          panic "unimp: insertBetween after a foreign call -- probably not a good idea"
-
-        newBlocks :: MonadUnique m => m (BlockId, [CmmBlock])
-        newBlocks = do id <- liftM mkBlockId $ getUniqueM
-                       return $ (id, [blockOfNodeList (JustC (CmmEntry id), ms, JustC (CmmBranch succId))])
-        mbNewBlocks :: MonadUnique m => Maybe BlockId -> m (Maybe BlockId, [CmmBlock])
-        mbNewBlocks (Just k) = if k == succId then liftM fstJust newBlocks
-                               else return (Just k, [])
-        mbNewBlocks Nothing  = return (Nothing, [])
-        fstJust (id, bs) = (Just id, bs)
+postorderDfs g = {-# SCC "postorderDfs" #-} postorder_dfs_from (toBlockMap g) (g_entry g)
 
 -------------------------------------------------
 -- Running dataflow analysis and/or rewrites
 
 -- Constructing forward and backward analysis-only pass
-analFwd    :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdPass m n f
-analBwd    :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdPass m n f
+analFwd    :: DataflowLattice f -> FwdTransfer n f -> FwdPass UniqSM n f
+analBwd    :: DataflowLattice f -> BwdTransfer n f -> BwdPass UniqSM n f
 
 analFwd lat xfer = analRewFwd lat xfer noFwdRewrite
 analBwd lat xfer = analRewBwd lat xfer noBwdRewrite
 
 -- Constructing forward and backward analysis + rewrite pass
-analRewFwd :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdRewrite m n f -> FwdPass m n f
-analRewBwd :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdRewrite m n f -> BwdPass m n f
+analRewFwd :: DataflowLattice f -> FwdTransfer n f
+           -> FwdRewrite UniqSM n f
+           -> FwdPass UniqSM n f
+
+analRewBwd :: DataflowLattice f
+           -> BwdTransfer n f
+           -> BwdRewrite UniqSM n f
+           -> BwdPass UniqSM n f
 
 analRewFwd lat xfer rew = FwdPass {fp_lattice = lat, fp_transfer = xfer, fp_rewrite = rew}
 analRewBwd lat xfer rew = BwdPass {bp_lattice = lat, bp_transfer = xfer, bp_rewrite = rew}
 
 -- Running forward and backward dataflow analysis + optional rewrite
-dataflowPassFwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> FwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
+dataflowPassFwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> FwdPass UniqSM n f
+                -> UniqSM (GenCmmGraph n, BlockEnv f)
 dataflowPassFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do
   (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
   return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
 
-dataflowPassBwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> BwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
+dataflowAnalFwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> FwdPass UniqSM n f
+                -> BlockEnv f
+dataflowAnalFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd =
+  analyzeFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
+
+dataflowAnalFwdBlocks :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> FwdPass UniqSM n f
+                -> UniqSM (BlockEnv f)
+dataflowAnalFwdBlocks (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do
+--  (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
+--  return facts
+  return (analyzeFwdBlocks fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts))
+
+dataflowAnalBwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> BwdPass UniqSM n f
+                -> BlockEnv f
+dataflowAnalBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd =
+  analyzeBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts)
+
+dataflowPassBwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> BwdPass UniqSM n f
+                -> UniqSM (GenCmmGraph n, BlockEnv f)
 dataflowPassBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd = do
   (graph, facts, NothingO) <- analyzeAndRewriteBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts)
   return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
diff --git a/compiler/cmm/Hoopl.hs b/compiler/cmm/Hoopl.hs
new file mode 100644
index 0000000000..0eca85cb8a
--- /dev/null
+++ b/compiler/cmm/Hoopl.hs
@@ -0,0 +1,125 @@
+module Hoopl (
+    module Compiler.Hoopl,
+    module Hoopl.Dataflow,
+    deepFwdRw, deepFwdRw3,
+    deepBwdRw, deepBwdRw3,
+    thenFwdRw
+  ) where
+
+import Compiler.Hoopl hiding
+  ( Unique,
+    FwdTransfer(..), FwdRewrite(..), FwdPass(..),
+    BwdTransfer(..), BwdRewrite(..), BwdPass(..),
+    noFwdRewrite, noBwdRewrite,
+    analyzeAndRewriteFwd, analyzeAndRewriteBwd,
+    mkFactBase, Fact,
+    mkBRewrite, mkBRewrite3, mkBTransfer, mkBTransfer3,
+    mkFRewrite, mkFRewrite3, mkFTransfer, mkFTransfer3,
+    deepFwdRw, deepFwdRw3, thenFwdRw, iterFwdRw,
+    deepBwdRw, deepBwdRw3, thenBwdRw, iterBwdRw
+  )
+
+import Hoopl.Dataflow
+import Control.Monad
+import UniqSupply
+
+deepFwdRw3 :: (n C O -> f -> UniqSM (Maybe (Graph n C O)))
+           -> (n O O -> f -> UniqSM (Maybe (Graph n O O)))
+           -> (n O C -> f -> UniqSM (Maybe (Graph n O C)))
+           -> (FwdRewrite UniqSM n f)
+deepFwdRw :: (forall e x . n e x -> f -> UniqSM (Maybe (Graph n e x))) -> FwdRewrite UniqSM n f
+deepFwdRw3 f m l = iterFwdRw $ mkFRewrite3 f m l
+deepFwdRw f = deepFwdRw3 f f f
+
+-- N.B. rw3, rw3', and rw3a are triples of functions.
+-- But rw and rw' are single functions.
+thenFwdRw :: forall n f.
+             FwdRewrite UniqSM n f
+          -> FwdRewrite UniqSM n f 
+          -> FwdRewrite UniqSM n f
+thenFwdRw rw3 rw3' = wrapFR2 thenrw rw3 rw3'
+ where
+  thenrw :: forall e x t t1.
+               (t -> t1 -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+            -> (t -> t1 -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+            -> t
+            -> t1
+            -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))
+  thenrw rw rw' n f = rw n f >>= fwdRes
+     where fwdRes Nothing   = rw' n f
+           fwdRes (Just gr) = return $ Just $ fadd_rw rw3' gr
+
+iterFwdRw :: forall n f.
+             FwdRewrite UniqSM n f
+          -> FwdRewrite UniqSM n f
+iterFwdRw rw3 = wrapFR iter rw3
+ where iter :: forall a e x t.
+               (t -> a -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+               -> t
+               -> a
+               -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))
+       iter rw n = (liftM $ liftM $ fadd_rw (iterFwdRw rw3)) . rw n
+
+-- | Function inspired by 'rew' in the paper
+_frewrite_cps :: ((Graph n e x, FwdRewrite UniqSM n f) -> UniqSM a)
+             -> UniqSM a
+             -> (forall e x . n e x -> f -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+             -> n e x
+             -> f
+             -> UniqSM a
+_frewrite_cps j n rw node f =
+    do mg <- rw node f
+       case mg of Nothing -> n
+                  Just gr -> j gr
+
+
+
+-- | Function inspired by 'add' in the paper
+fadd_rw :: FwdRewrite UniqSM n f
+        -> (Graph n e x, FwdRewrite UniqSM n f)
+        -> (Graph n e x, FwdRewrite UniqSM n f)
+fadd_rw rw2 (g, rw1) = (g, rw1 `thenFwdRw` rw2)
+
+
+
+deepBwdRw3 ::
+              (n C O -> f          -> UniqSM (Maybe (Graph n C O)))
+           -> (n O O -> f          -> UniqSM (Maybe (Graph n O O)))
+           -> (n O C -> FactBase f -> UniqSM (Maybe (Graph n O C)))
+           -> (BwdRewrite UniqSM n f)
+deepBwdRw  :: (forall e x . n e x -> Fact x f -> UniqSM (Maybe (Graph n e x)))
+           -> BwdRewrite UniqSM n f
+deepBwdRw3 f m l = iterBwdRw $ mkBRewrite3 f m l
+deepBwdRw  f = deepBwdRw3 f f f
+
+
+thenBwdRw :: forall n f. BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f
+thenBwdRw rw1 rw2 = wrapBR2 f rw1 rw2
+  where f :: forall t t1 t2 e x.
+             t
+             -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)))
+             -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)))
+             -> t1
+             -> t2
+             -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f))
+        f _ rw1 rw2' n f = do
+          res1 <- rw1 n f
+          case res1 of
+            Nothing -> rw2' n f
+            Just gr -> return $ Just $ badd_rw rw2 gr
+
+iterBwdRw :: forall n f. BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f
+iterBwdRw rw = wrapBR f rw
+  where f :: forall t e x t1 t2.
+             t
+             -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)))
+             -> t1
+             -> t2
+             -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f))
+        f _ rw' n f = liftM (liftM (badd_rw (iterBwdRw rw))) (rw' n f)
+
+-- | Function inspired by 'add' in the paper
+badd_rw :: BwdRewrite UniqSM n f
+        -> (Graph n e x, BwdRewrite UniqSM n f)
+        -> (Graph n e x, BwdRewrite UniqSM n f)
+badd_rw rw2 (g, rw1) = (g, rw1 `thenBwdRw` rw2)
diff --git a/compiler/cmm/Hoopl/Dataflow.hs b/compiler/cmm/Hoopl/Dataflow.hs
new file mode 100644
index 0000000000..e35beb93e9
--- /dev/null
+++ b/compiler/cmm/Hoopl/Dataflow.hs
@@ -0,0 +1,887 @@
+--
+-- Copyright (c) 2010, João Dias, Simon Marlow, Simon Peyton Jones,
+-- and Norman Ramsey
+--
+-- Modifications copyright (c) The University of Glasgow 2012
+--
+-- This module is a specialised and optimised version of
+-- Compiler.Hoopl.Dataflow in the hoopl package.  In particular it is
+-- specialised to the UniqSM monad.
+--
+
+{-# LANGUAGE RankNTypes, ScopedTypeVariables, GADTs, EmptyDataDecls, PatternGuards, TypeFamilies, MultiParamTypeClasses #-}
+#if __GLASGOW_HASKELL__ >= 703
+{-# OPTIONS_GHC -fprof-auto-top #-}
+#endif
+#if __GLASGOW_HASKELL__ >= 701
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+module Hoopl.Dataflow
+  ( DataflowLattice(..), OldFact(..), NewFact(..), Fact, mkFactBase
+  , ChangeFlag(..)
+  , FwdPass(..), FwdTransfer, mkFTransfer, mkFTransfer3, getFTransfer3
+  -- * Respecting Fuel
+
+  -- $fuel
+  , FwdRewrite,  mkFRewrite,  mkFRewrite3,  getFRewrite3, noFwdRewrite
+  , wrapFR, wrapFR2
+  , BwdPass(..), BwdTransfer, mkBTransfer, mkBTransfer3, getBTransfer3
+  , wrapBR, wrapBR2
+  , BwdRewrite,  mkBRewrite,  mkBRewrite3,  getBRewrite3, noBwdRewrite
+  , analyzeAndRewriteFwd,  analyzeAndRewriteBwd
+  , analyzeFwd, analyzeFwdBlocks, analyzeBwd
+  )
+where
+
+import UniqSupply
+
+import Data.Maybe
+import Data.Array
+
+import Compiler.Hoopl hiding
+   ( mkBRewrite3, mkFRewrite3, noFwdRewrite, noBwdRewrite
+   , analyzeAndRewriteBwd, analyzeAndRewriteFwd
+   )
+import Compiler.Hoopl.Internals
+  ( wrapFR, wrapFR2
+  , wrapBR, wrapBR2
+  , splice
+  )
+
+
+-- -----------------------------------------------------------------------------
+
+noRewrite :: a -> b -> UniqSM (Maybe c)
+noRewrite _ _ = return Nothing
+
+noFwdRewrite :: FwdRewrite UniqSM n f
+noFwdRewrite = FwdRewrite3 (noRewrite, noRewrite, noRewrite)
+
+-- | Functions passed to 'mkFRewrite3' should not be aware of the fuel supply.
+-- The result returned by 'mkFRewrite3' respects fuel.
+mkFRewrite3 :: forall n f.
+               (n C O -> f -> UniqSM (Maybe (Graph n C O)))
+            -> (n O O -> f -> UniqSM (Maybe (Graph n O O)))
+            -> (n O C -> f -> UniqSM (Maybe (Graph n O C)))
+            -> FwdRewrite UniqSM n f
+mkFRewrite3 f m l = FwdRewrite3 (lift f, lift m, lift l)
+  where lift :: forall t t1 a. (t -> t1 -> UniqSM (Maybe a))
+                             -> t -> t1 -> UniqSM (Maybe (a, FwdRewrite UniqSM n f))
+        {-# INLINE lift #-}
+        lift rw node fact = do
+             a <- rw node fact
+             case a of
+               Nothing -> return Nothing
+               Just a  -> return (Just (a,noFwdRewrite))
+
+noBwdRewrite :: BwdRewrite UniqSM n f
+noBwdRewrite = BwdRewrite3 (noRewrite, noRewrite, noRewrite)
+
+mkBRewrite3 :: forall n f.
+               (n C O -> f          -> UniqSM (Maybe (Graph n C O)))
+            -> (n O O -> f          -> UniqSM (Maybe (Graph n O O)))
+            -> (n O C -> FactBase f -> UniqSM (Maybe (Graph n O C)))
+            -> BwdRewrite UniqSM n f
+mkBRewrite3 f m l = BwdRewrite3 (lift f, lift m, lift l)
+  where lift :: forall t t1 a. (t -> t1 -> UniqSM (Maybe a))
+                             -> t -> t1 -> UniqSM (Maybe (a, BwdRewrite UniqSM n f))
+        {-# INLINE lift #-}
+        lift rw node fact = do
+             a <- rw node fact
+             case a of
+               Nothing -> return Nothing
+               Just a  -> return (Just (a,noBwdRewrite))
+
+-----------------------------------------------------------------------------
+--              Analyze and rewrite forward: the interface
+-----------------------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeAndRewriteFwd
+   :: forall n f e x .  NonLocal n =>
+      FwdPass UniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n  e x -> Fact e f
+   -> UniqSM (Graph n e x, FactBase f, MaybeO x f)
+analyzeAndRewriteFwd pass entries g f =
+  do (rg, fout) <- arfGraph pass (fmap targetLabels entries) g f
+     let (g', fb) = normalizeGraph rg
+     return (g', fb, distinguishedExitFact g' fout)
+
+distinguishedExitFact :: forall n e x f . Graph n e x -> Fact x f -> MaybeO x f
+distinguishedExitFact g f = maybe g
+    where maybe :: Graph n e x -> MaybeO x f
+          maybe GNil       = JustO f
+          maybe (GUnit {}) = JustO f
+          maybe (GMany _ _ x) = case x of NothingO -> NothingO
+                                          JustO _  -> JustO f
+
+----------------------------------------------------------------
+--       Forward Implementation
+----------------------------------------------------------------
+
+type Entries e = MaybeC e [Label]
+
+arfGraph :: forall n f e x .  NonLocal n =>
+            FwdPass UniqSM n f ->
+            Entries e -> Graph n e x -> Fact e f -> UniqSM (DG f n e x, Fact x f)
+arfGraph pass@FwdPass { fp_lattice = lattice,
+                        fp_transfer = transfer,
+                        fp_rewrite  = rewrite } entries g in_fact = graph g in_fact
+  where
+    {- nested type synonyms would be so lovely here 
+    type ARF  thing = forall e x . thing e x -> f        -> m (DG f n e x, Fact x f)
+    type ARFX thing = forall e x . thing e x -> Fact e f -> m (DG f n e x, Fact x f)
+    -}
+    graph ::              Graph n e x -> Fact e f -> UniqSM (DG f n e x, Fact x f)
+    block :: forall e x .
+             Block n e x -> f -> UniqSM (DG f n e x, Fact x f)
+
+    body  :: [Label] -> LabelMap (Block n C C)
+          -> Fact C f -> UniqSM (DG f n C C, Fact C f)
+                    -- Outgoing factbase is restricted to Labels *not* in
+                    -- in the Body; the facts for Labels *in*
+                    -- the Body are in the 'DG f n C C'
+
+    cat :: forall e a x f1 f2 f3.
+           (f1 -> UniqSM (DG f n e a, f2))
+        -> (f2 -> UniqSM (DG f n a x, f3))
+        -> (f1 -> UniqSM (DG f n e x, f3))
+
+    graph GNil            f = return (dgnil, f)
+    graph (GUnit blk)     f = block blk f
+    graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f
+     where
+      ebcat :: MaybeO e (Block n O C) -> Body n -> Fact e f -> UniqSM (DG f n e C, Fact C f)
+      exit  :: MaybeO x (Block n C O)           -> Fact C f -> UniqSM (DG f n C x, Fact x f)
+      exit (JustO blk) f = arfx block blk f
+      exit NothingO    f = return (dgnilC, f)
+      ebcat entry bdy f = c entries entry f
+       where c :: MaybeC e [Label] -> MaybeO e (Block n O C)
+                -> Fact e f -> UniqSM (DG f n e C, Fact C f)
+             c NothingC (JustO entry)   f = (block entry `cat` body (successors entry) bdy) f
+             c (JustC entries) NothingO f = body entries bdy f
+             c _ _ _ = error "bogus GADT pattern match failure"
+
+    -- Lift from nodes to blocks
+    block BNil            f = return (dgnil, f)
+    block (BlockCO n b)   f = (node n `cat` block b) f
+    block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f
+    block (BlockOC   b n) f =              (block b `cat` node n) f
+
+    block (BMiddle n)     f = node n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BSnoc h n)     f = (block h  `cat` node n) f
+    block (BCons n t)     f = (node  n  `cat` block t) f
+
+    {-# INLINE node #-}
+    node :: forall e x . (ShapeLifter e x)
+         => n e x -> f -> UniqSM (DG f n e x, Fact x f)
+    node n f
+     = do { grw <- frewrite rewrite n f
+          ; case grw of
+              Nothing -> return ( singletonDG f n
+                                , ftransfer transfer n f )
+              Just (g, rw) ->
+                  let pass' = pass { fp_rewrite = rw }
+                      f'    = fwdEntryFact n f
+                  in  arfGraph pass' (fwdEntryLabel n) g f' }
+
+    -- | Compose fact transformers and concatenate the resulting
+    -- rewritten graphs.
+    {-# INLINE cat #-} 
+    cat ft1 ft2 f = do { (g1,f1) <- ft1 f
+                       ; (g2,f2) <- ft2 f1
+                       ; let !g = g1 `dgSplice` g2
+                       ; return (g, f2) }
+
+    arfx :: forall x .
+            (Block n C x ->        f -> UniqSM (DG f n C x, Fact x f))
+         -> (Block n C x -> Fact C f -> UniqSM (DG f n C x, Fact x f))
+    arfx arf thing fb = 
+      arf thing $ fromJust $ lookupFact (entryLabel thing) $ joinInFacts lattice fb
+     -- joinInFacts adds debugging information
+
+
+                    -- Outgoing factbase is restricted to Labels *not* in
+                    -- in the Body; the facts for Labels *in*
+                    -- the Body are in the 'DG f n C C'
+    body entries blockmap init_fbase
+      = fixpoint Fwd lattice do_block entries blockmap init_fbase
+      where
+        lattice = fp_lattice pass
+        do_block :: forall x . Block n C x -> FactBase f
+                 -> UniqSM (DG f n C x, Fact x f)
+        do_block b fb = block b entryFact
+          where entryFact = getFact lattice (entryLabel b) fb
+
+
+-- Join all the incoming facts with bottom.
+-- We know the results _shouldn't change_, but the transfer
+-- functions might, for example, generate some debugging traces.
+joinInFacts :: DataflowLattice f -> FactBase f -> FactBase f
+joinInFacts (lattice @ DataflowLattice {fact_bot = bot, fact_join = fj}) fb =
+  mkFactBase lattice $ map botJoin $ mapToList fb
+    where botJoin (l, f) = (l, snd $ fj l (OldFact bot) (NewFact f))
+
+forwardBlockList :: (NonLocal n)
+                 => [Label] -> Body n -> [Block n C C]
+-- This produces a list of blocks in order suitable for forward analysis,
+-- along with the list of Labels it may depend on for facts.
+forwardBlockList entries blks = postorder_dfs_from blks entries
+
+----------------------------------------------------------------
+--       Forward Analysis only
+----------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeFwd
+   :: forall n f e .  NonLocal n =>
+      FwdPass UniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n e C -> Fact e f
+   -> FactBase f
+analyzeFwd FwdPass { fp_lattice = lattice,
+                     fp_transfer = FwdTransfer3 (ftr, mtr, ltr) }
+  entries g in_fact = graph g in_fact
+  where
+    graph :: Graph n e C -> Fact e f -> FactBase f
+    graph (GMany entry blockmap NothingO)
+      = case (entries, entry) of
+         (NothingC, JustO entry)   -> block entry `cat` body (successors entry)
+         (JustC entries, NothingO) -> body entries
+         _ -> error "bogus GADT pattern match failure"
+     where
+       body  :: [Label] -> Fact C f -> Fact C f
+       body entries f
+         = fixpointAnal Fwd lattice do_block entries blockmap f
+         where
+           do_block :: forall x . Block n C x -> FactBase f -> Fact x f
+           do_block b fb = block b entryFact
+             where entryFact = getFact lattice (entryLabel b) fb
+
+    -- NB. eta-expand block, GHC can't do this by itself.  See #5809.
+    block :: forall e x . Block n e x -> f -> Fact x f
+    block BNil            f = f
+    block (BlockCO n b)   f = (ftr n `cat`  block b) f
+    block (BlockCC l b n) f = (ftr l `cat` (block b `cat` ltr n)) f
+    block (BlockOC   b n) f =              (block b `cat` ltr n) f
+
+    block (BMiddle n)     f = mtr n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BSnoc h n)     f = (block h  `cat` mtr n) f
+    block (BCons n t)     f = (mtr  n   `cat` block t) f
+
+    {-# INLINE cat #-}
+    cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3)
+    cat ft1 ft2 = \f -> ft2 $! ft1 f
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeFwdBlocks
+   :: forall n f e .  NonLocal n =>
+      FwdPass UniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n e C -> Fact e f
+   -> FactBase f
+analyzeFwdBlocks FwdPass { fp_lattice = lattice,
+                           fp_transfer = FwdTransfer3 (ftr, _, ltr) }
+  entries g in_fact = graph g in_fact
+  where
+    graph :: Graph n e C -> Fact e f -> FactBase f
+    graph (GMany entry blockmap NothingO)
+      = case (entries, entry) of
+         (NothingC, JustO entry)   -> block entry `cat` body (successors entry)
+         (JustC entries, NothingO) -> body entries
+         _ -> error "bogus GADT pattern match failure"
+     where
+       body  :: [Label] -> Fact C f -> Fact C f
+       body entries f
+         = fixpointAnal Fwd lattice do_block entries blockmap f
+         where
+           do_block :: forall x . Block n C x -> FactBase f -> Fact x f
+           do_block b fb = block b entryFact
+             where entryFact = getFact lattice (entryLabel b) fb
+
+    -- NB. eta-expand block, GHC can't do this by itself.  See #5809.
+    block :: forall e x . Block n e x -> f -> Fact x f
+    block BNil            f = f
+    block (BlockCO n _)   f = ftr n f
+    block (BlockCC l _ n) f = (ftr l `cat` ltr n) f
+    block (BlockOC   _ n) f = ltr n f
+    block _               _ = error "analyzeFwdBlocks"
+
+    {-# INLINE cat #-}
+    cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3)
+    cat ft1 ft2 = \f -> ft2 $! ft1 f
+
+----------------------------------------------------------------
+--       Backward Analysis only
+----------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeBwd
+   :: forall n f e .  NonLocal n =>
+      BwdPass UniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n e C -> Fact C f
+   -> FactBase f
+analyzeBwd BwdPass { bp_lattice = lattice,
+                     bp_transfer = BwdTransfer3 (ftr, mtr, ltr) }
+   entries g in_fact = graph g in_fact
+  where
+    graph :: Graph n e C -> Fact C f -> FactBase f
+    graph (GMany entry blockmap NothingO)
+      = case (entries, entry) of
+         (NothingC, JustO entry)   -> body (successors entry)
+         (JustC entries, NothingO) -> body entries
+         _ -> error "bogus GADT pattern match failure"
+     where
+       body  :: [Label] -> Fact C f -> Fact C f
+       body entries f
+         = fixpointAnal Bwd lattice do_block entries blockmap f
+         where
+           do_block :: forall x . Block n C x -> Fact x f -> FactBase f
+           do_block b fb = mapSingleton (entryLabel b) (block b fb)
+
+    -- NB. eta-expand block, GHC can't do this by itself.  See #5809.
+    block :: forall e x . Block n e x -> Fact x f -> f
+    block BNil            f = f
+    block (BlockCO n b)   f = (ftr n `cat`  block b) f
+    block (BlockCC l b n) f = ((ftr l `cat` block b) `cat` ltr n) f
+    block (BlockOC   b n) f =              (block b `cat` ltr n) f
+
+    block (BMiddle n)     f = mtr n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BSnoc h n)     f = (block h  `cat` mtr n) f
+    block (BCons n t)     f = (mtr  n   `cat` block t) f
+
+    {-# INLINE cat #-}
+    cat :: forall f1 f2 f3 . (f2 -> f3) -> (f1 -> f2) -> (f1 -> f3)
+    cat ft1 ft2 = \f -> ft1 $! ft2 f
+
+-----------------------------------------------------------------------------
+--              Backward analysis and rewriting: the interface
+-----------------------------------------------------------------------------
+
+
+-- | if the graph being analyzed is open at the exit, I don't
+--   quite understand the implications of possible other exits
+analyzeAndRewriteBwd
+   :: NonLocal n
+   => BwdPass UniqSM n f
+   -> MaybeC e [Label] -> Graph n e x -> Fact x f
+   -> UniqSM (Graph n e x, FactBase f, MaybeO e f)
+analyzeAndRewriteBwd pass entries g f =
+  do (rg, fout) <- arbGraph pass (fmap targetLabels entries) g f
+     let (g', fb) = normalizeGraph rg
+     return (g', fb, distinguishedEntryFact g' fout)
+
+distinguishedEntryFact :: forall n e x f . Graph n e x -> Fact e f -> MaybeO e f
+distinguishedEntryFact g f = maybe g
+    where maybe :: Graph n e x -> MaybeO e f
+          maybe GNil       = JustO f
+          maybe (GUnit {}) = JustO f
+          maybe (GMany e _ _) = case e of NothingO -> NothingO
+                                          JustO _  -> JustO f
+
+
+-----------------------------------------------------------------------------
+--              Backward implementation
+-----------------------------------------------------------------------------
+
+arbGraph :: forall n f e x .
+            NonLocal n =>
+            BwdPass UniqSM n f ->
+            Entries e -> Graph n e x -> Fact x f -> UniqSM (DG f n e x, Fact e f)
+arbGraph pass@BwdPass { bp_lattice  = lattice,
+                        bp_transfer = transfer,
+                        bp_rewrite  = rewrite } entries g in_fact = graph g in_fact
+  where
+    {- nested type synonyms would be so lovely here 
+    type ARB  thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, f)
+    type ARBX thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, Fact e f)
+    -}
+    graph ::              Graph n e x -> Fact x f -> UniqSM (DG f n e x, Fact e f)
+    block :: forall e x . Block n e x -> Fact x f -> UniqSM (DG f n e x, f)
+    body  :: [Label] -> Body n -> Fact C f -> UniqSM (DG f n C C, Fact C f)
+    node  :: forall e x . (ShapeLifter e x) 
+             => n e x       -> Fact x f -> UniqSM (DG f n e x, f)
+    cat :: forall e a x info info' info''.
+           (info' -> UniqSM (DG f n e a, info''))
+        -> (info  -> UniqSM (DG f n a x, info'))
+        -> (info  -> UniqSM (DG f n e x, info''))
+
+    graph GNil            f = return (dgnil, f)
+    graph (GUnit blk)     f = block blk f
+    graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f
+     where
+      ebcat :: MaybeO e (Block n O C) -> Body n -> Fact C f -> UniqSM (DG f n e C, Fact e f)
+      exit  :: MaybeO x (Block n C O)           -> Fact x f -> UniqSM (DG f n C x, Fact C f)
+      exit (JustO blk) f = arbx block blk f
+      exit NothingO    f = return (dgnilC, f)
+      ebcat entry bdy f = c entries entry f
+       where c :: MaybeC e [Label] -> MaybeO e (Block n O C)
+                -> Fact C f -> UniqSM (DG f n e C, Fact e f)
+             c NothingC (JustO entry)   f = (block entry `cat` body (successors entry) bdy) f
+             c (JustC entries) NothingO f = body entries bdy f
+             c _ _ _ = error "bogus GADT pattern match failure"
+
+    -- Lift from nodes to blocks
+    block BNil            f = return (dgnil, f)
+    block (BlockCO n b)   f = (node n `cat` block b) f
+    block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f
+    block (BlockOC   b n) f =              (block b `cat` node n) f
+
+    block (BMiddle n)     f = node n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BSnoc h n)     f = (block h  `cat` node n) f
+    block (BCons n t)     f = (node  n  `cat` block t) f
+
+    {-# INLINE node #-}
+    node n f
+      = do { bwdres <- brewrite rewrite n f
+           ; case bwdres of
+               Nothing -> return (singletonDG entry_f n, entry_f)
+                            where entry_f = btransfer transfer n f
+               Just (g, rw) ->
+                          do { let pass' = pass { bp_rewrite = rw }
+                             ; (g, f) <- arbGraph pass' (fwdEntryLabel n) g f
+                             ; return (g, bwdEntryFact lattice n f)} }
+
+    -- | Compose fact transformers and concatenate the resulting
+    -- rewritten graphs.
+    {-# INLINE cat #-} 
+    cat ft1 ft2 f = do { (g2,f2) <- ft2 f
+                       ; (g1,f1) <- ft1 f2
+                       ; let !g = g1 `dgSplice` g2
+                       ; return (g, f1) }
+
+    arbx :: forall x .
+            (Block n C x -> Fact x f -> UniqSM (DG f n C x, f))
+         -> (Block n C x -> Fact x f -> UniqSM (DG f n C x, Fact C f))
+
+    arbx arb thing f = do { (rg, f) <- arb thing f
+                          ; let fb = joinInFacts (bp_lattice pass) $
+                                     mapSingleton (entryLabel thing) f
+                          ; return (rg, fb) }
+     -- joinInFacts adds debugging information
+
+                    -- Outgoing factbase is restricted to Labels *not* in
+                    -- in the Body; the facts for Labels *in*
+                    -- the Body are in the 'DG f n C C'
+    body entries blockmap init_fbase
+      = fixpoint Bwd (bp_lattice pass) do_block entries blockmap init_fbase
+      where
+        do_block :: forall x. Block n C x -> Fact x f -> UniqSM (DG f n C x, LabelMap f)
+        do_block b f = do (g, f) <- block b f
+                          return (g, mapSingleton (entryLabel b) f)
+
+
+{-
+
+The forward and backward cases are not dual.  In the forward case, the
+entry points are known, and one simply traverses the body blocks from
+those points.  In the backward case, something is known about the exit
+points, but this information is essentially useless, because we don't
+actually have a dual graph (that is, one with edges reversed) to
+compute with.  (Even if we did have a dual graph, it would not avail
+us---a backward analysis must include reachable blocks that don't
+reach the exit, as in a procedure that loops forever and has side
+effects.)
+
+-}
+
+-----------------------------------------------------------------------------
+--      fixpoint
+-----------------------------------------------------------------------------
+
+data Direction = Fwd | Bwd
+
+-- | fixpointing for analysis-only
+--
+fixpointAnal :: forall n f. NonLocal n
+ => Direction
+ -> DataflowLattice f
+ -> (Block n C C -> Fact C f -> Fact C f)
+ -> [Label]
+ -> LabelMap (Block n C C)
+ -> Fact C f -> FactBase f
+
+fixpointAnal direction DataflowLattice{ fact_bot = _, fact_join = join }
+              do_block entries blockmap init_fbase
+  = loop start init_fbase
+  where
+    blocks     = sortBlocks direction entries blockmap
+    n          = length blocks
+    block_arr  = {-# SCC "block_arr" #-} listArray (0,n-1) blocks
+    start      = {-# SCC "start" #-} [0..n-1]
+    dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks
+
+    loop
+       :: IntHeap      -- blocks still to analyse
+       -> FactBase f  -- current factbase (increases monotonically)
+       -> FactBase f
+
+    loop []        fbase = fbase
+    loop (ix:todo) fbase =
+           let
+               blk = block_arr ! ix
+
+               out_facts = {-# SCC "do_block" #-} do_block blk fbase
+
+               !(todo', fbase') = {-# SCC "mapFoldWithKey" #-}
+                     mapFoldWithKey (updateFact join dep_blocks)
+                                    (todo,fbase) out_facts
+           in
+           -- trace ("analysing: " ++ show (entryLabel blk)) $
+           -- trace ("fbase': " ++ show (mapKeys fbase')) $ return ()
+           -- trace ("changed: " ++ show changed) $ return ()
+           -- trace ("to analyse: " ++ show to_analyse) $ return ()
+
+           loop todo' fbase'
+
+
+-- | fixpointing for combined analysis/rewriting
+--
+fixpoint :: forall n f. NonLocal n
+ => Direction
+ -> DataflowLattice f
+ -> (Block n C C -> Fact C f -> UniqSM (DG f n C C, Fact C f))
+ -> [Label]
+ -> LabelMap (Block n C C)
+ -> (Fact C f -> UniqSM (DG f n C C, Fact C f))
+
+fixpoint direction DataflowLattice{ fact_bot = _, fact_join = join }
+         do_block entries blockmap init_fbase
+  = do
+        -- trace ("fixpoint: " ++ show (case direction of Fwd -> True; Bwd -> False) ++ " " ++ show (mapKeys blockmap) ++ show entries ++ " " ++ show (mapKeys init_fbase)) $ return()
+        (fbase, newblocks) <- loop start init_fbase mapEmpty
+        -- trace ("fixpoint DONE: " ++ show (mapKeys fbase) ++ show (mapKeys newblocks)) $ return()
+        return (GMany NothingO newblocks NothingO,
+                mapDeleteList (mapKeys blockmap) fbase)
+    -- The successors of the Graph are the the Labels
+    -- for which we have facts and which are *not* in
+    -- the blocks of the graph
+  where
+    blocks     = sortBlocks direction entries blockmap
+    n          = length blocks
+    block_arr  = {-# SCC "block_arr" #-} listArray (0,n-1) blocks
+    start      = {-# SCC "start" #-} [0..n-1]
+    dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks
+
+    loop
+       :: IntHeap
+       -> FactBase f  -- current factbase (increases monotonically)
+       -> LabelMap (DBlock f n C C)  -- transformed graph
+       -> UniqSM (FactBase f, LabelMap (DBlock f n C C))
+
+    loop [] fbase newblocks = return (fbase, newblocks)
+    loop (ix:todo) fbase !newblocks = do
+           let blk = block_arr ! ix
+
+           -- trace ("analysing: " ++ show (entryLabel blk)) $ return ()
+           (rg, out_facts) <- do_block blk fbase
+           let !(todo', fbase') =
+                  mapFoldWithKey (updateFact join dep_blocks)
+                                 (todo,fbase) out_facts
+
+           -- trace ("fbase': " ++ show (mapKeys fbase')) $ return ()
+           -- trace ("changed: " ++ show changed) $ return ()
+           -- trace ("to analyse: " ++ show to_analyse) $ return ()
+
+           let newblocks' = case rg of
+                              GMany _ blks _ -> mapUnion blks newblocks
+     
+           loop todo' fbase' newblocks'
+
+
+{-  Note [TxFactBase invariants]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The TxFactBase is used only during a fixpoint iteration (or "sweep"),
+and accumulates facts (and the transformed code) during the fixpoint
+iteration.
+
+* tfb_fbase increases monotonically, across all sweeps
+
+* At the beginning of each sweep
+      tfb_cha  = NoChange
+      tfb_lbls = {}
+
+* During each sweep we process each block in turn.  Processing a block
+  is done thus:
+    1.  Read from tfb_fbase the facts for its entry label (forward)
+        or successors labels (backward)
+    2.  Transform those facts into new facts for its successors (forward)
+        or entry label (backward)
+    3.  Augment tfb_fbase with that info
+  We call the labels read in step (1) the "in-labels" of the sweep
+
+* The field tfb_lbls is the set of in-labels of all blocks that have
+  been processed so far this sweep, including the block that is
+  currently being processed.  tfb_lbls is initialised to {}.  It is a
+  subset of the Labels of the *original* (not transformed) blocks.
+
+* The tfb_cha field is set to SomeChange iff we decide we need to
+  perform another iteration of the fixpoint loop. It is initialsed to NoChange.
+
+  Specifically, we set tfb_cha to SomeChange in step (3) iff
+    (a) The fact in tfb_fbase for a block L changes
+    (b) L is in tfb_lbls
+  Reason: until a label enters the in-labels its accumuated fact in tfb_fbase
+  has not been read, hence cannot affect the outcome
+
+Note [Unreachable blocks]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+A block that is not in the domain of tfb_fbase is "currently unreachable".
+A currently-unreachable block is not even analyzed.  Reason: consider 
+constant prop and this graph, with entry point L1:
+  L1: x:=3; goto L4
+  L2: x:=4; goto L4
+  L4: if x>3 goto L2 else goto L5
+Here L2 is actually unreachable, but if we process it with bottom input fact,
+we'll propagate (x=4) to L4, and nuke the otherwise-good rewriting of L4.
+
+* If a currently-unreachable block is not analyzed, then its rewritten
+  graph will not be accumulated in tfb_rg.  And that is good:
+  unreachable blocks simply do not appear in the output.
+
+* Note that clients must be careful to provide a fact (even if bottom)
+  for each entry point. Otherwise useful blocks may be garbage collected.
+
+* Note that updateFact must set the change-flag if a label goes from
+  not-in-fbase to in-fbase, even if its fact is bottom.  In effect the
+  real fact lattice is
+       UNR
+       bottom
+       the points above bottom
+
+* Even if the fact is going from UNR to bottom, we still call the
+  client's fact_join function because it might give the client
+  some useful debugging information.
+
+* All of this only applies for *forward* ixpoints.  For the backward
+  case we must treat every block as reachable; it might finish with a
+  'return', and therefore have no successors, for example.
+-}
+
+
+-----------------------------------------------------------------------------
+--  Pieces that are shared by fixpoint and fixpoint_anal
+-----------------------------------------------------------------------------
+
+-- | Sort the blocks into the right order for analysis.
+sortBlocks :: NonLocal n => Direction -> [Label] -> LabelMap (Block n C C)
+           -> [Block n C C]
+sortBlocks direction entries blockmap
+   = case direction of Fwd -> fwd
+                       Bwd -> reverse fwd
+  where fwd = forwardBlockList entries blockmap
+
+-- | construct a mapping from L -> block indices.  If the fact for L
+-- changes, re-analyse the given blocks.
+mkDepBlocks :: NonLocal n => Direction -> [Block n C C] -> LabelMap [Int]
+mkDepBlocks Fwd blocks = go blocks 0 mapEmpty
+  where go []     !_  m = m
+        go (b:bs) !n m = go bs (n+1) $! mapInsert (entryLabel b) [n] m
+mkDepBlocks Bwd blocks = go blocks 0 mapEmpty
+  where go []     !_ m = m
+        go (b:bs) !n m = go bs (n+1) $! go' (successors b) m
+            where go' [] m = m
+                  go' (l:ls) m = go' ls (mapInsertWith (++) l [n] m)
+
+
+-- | After some new facts have been generated by analysing a block, we
+-- fold this function over them to generate (a) a list of block
+-- indices to (re-)analyse, and (b) the new FactBase.
+--
+updateFact :: JoinFun f -> LabelMap [Int]
+           -> Label -> f       -- out fact
+           -> (IntHeap, FactBase f)
+           -> (IntHeap, FactBase f)
+
+updateFact fact_join dep_blocks lbl new_fact (todo, fbase)
+  = case lookupFact lbl fbase of
+      Nothing       -> let !z = mapInsert lbl new_fact fbase in (changed, z)
+                           -- Note [no old fact]
+      Just old_fact ->
+        case fact_join lbl (OldFact old_fact) (NewFact new_fact) of
+          (NoChange, _) -> (todo, fbase)
+          (_,        f) -> let !z = mapInsert lbl f fbase in (changed, z)
+  where
+     changed = foldr insertIntHeap todo $
+                 mapFindWithDefault [] lbl dep_blocks
+
+{-
+Note [no old fact]
+
+We know that the new_fact is >= _|_, so we don't need to join.  However,
+if the new fact is also _|_, and we have already analysed its block,
+we don't need to record a change.  So there's a tradeoff here.  It turns
+out that always recording a change is faster.
+-}
+
+-----------------------------------------------------------------------------
+--      DG: an internal data type for 'decorated graphs'
+--          TOTALLY internal to Hoopl; each block is decorated with a fact
+-----------------------------------------------------------------------------
+
+type DG f  = Graph' (DBlock f)
+data DBlock f n e x = DBlock f (Block n e x) -- ^ block decorated with fact
+
+instance NonLocal n => NonLocal (DBlock f n) where
+  entryLabel (DBlock _ b) = entryLabel b
+  successors (DBlock _ b) = successors b
+
+--- constructors
+
+dgnil  :: DG f n O O
+dgnilC :: DG f n C C
+dgSplice  :: NonLocal n => DG f n e a -> DG f n a x -> DG f n e x
+
+---- observers
+
+normalizeGraph :: forall n f e x .
+                  NonLocal n => DG f n e x
+               -> (Graph n e x, FactBase f)
+                 -- A Graph together with the facts for that graph
+                 -- The domains of the two maps should be identical
+
+normalizeGraph g = (mapGraphBlocks dropFact g, facts g)
+    where dropFact :: DBlock t t1 t2 t3 -> Block t1 t2 t3
+          dropFact (DBlock _ b) = b
+          facts :: DG f n e x -> FactBase f
+          facts GNil = noFacts
+          facts (GUnit _) = noFacts
+          facts (GMany _ body exit) = bodyFacts body `mapUnion` exitFacts exit
+          exitFacts :: MaybeO x (DBlock f n C O) -> FactBase f
+          exitFacts NothingO = noFacts
+          exitFacts (JustO (DBlock f b)) = mapSingleton (entryLabel b) f
+          bodyFacts :: LabelMap (DBlock f n C C) -> FactBase f
+          bodyFacts body = mapFoldWithKey f noFacts body
+            where f :: forall t a x. (NonLocal t) => Label -> DBlock a t C x -> LabelMap a -> LabelMap a
+                  f lbl (DBlock f _) fb = mapInsert lbl f fb
+
+--- implementation of the constructors (boring)
+
+dgnil  = GNil
+dgnilC = GMany NothingO emptyBody NothingO
+
+dgSplice = splice fzCat
+  where fzCat :: DBlock f n e O -> DBlock t n O x -> DBlock f n e x
+        fzCat (DBlock f b1) (DBlock _ b2) = DBlock f $! b1 `blockAppend` b2
+        -- NB. strictness, this function is hammered.
+
+----------------------------------------------------------------
+--       Utilities
+----------------------------------------------------------------
+
+-- Lifting based on shape:
+--  - from nodes to blocks
+--  - from facts to fact-like things
+-- Lowering back:
+--  - from fact-like things to facts
+-- Note that the latter two functions depend only on the entry shape.
+class ShapeLifter e x where
+ singletonDG   :: f -> n e x -> DG f n e x
+ fwdEntryFact  :: NonLocal n => n e x -> f -> Fact e f
+ fwdEntryLabel :: NonLocal n => n e x -> MaybeC e [Label]
+ ftransfer :: FwdTransfer n f -> n e x -> f -> Fact x f
+ frewrite  :: FwdRewrite m n f -> n e x
+           -> f -> m (Maybe (Graph n e x, FwdRewrite m n f))
+-- @ end node.tex
+ bwdEntryFact :: NonLocal n => DataflowLattice f -> n e x -> Fact e f -> f
+ btransfer    :: BwdTransfer n f -> n e x -> Fact x f -> f
+ brewrite     :: BwdRewrite m n f -> n e x
+              -> Fact x f -> m (Maybe (Graph n e x, BwdRewrite m n f))
+
+instance ShapeLifter C O where
+  singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil))
+  fwdEntryFact     n f  = mapSingleton (entryLabel n) f
+  bwdEntryFact lat n fb = getFact lat (entryLabel n) fb
+  ftransfer (FwdTransfer3 (ft, _, _)) n f = ft n f
+  btransfer (BwdTransfer3 (bt, _, _)) n f = bt n f
+  frewrite  (FwdRewrite3  (fr, _, _)) n f = fr n f
+  brewrite  (BwdRewrite3  (br, _, _)) n f = br n f
+  fwdEntryLabel n = JustC [entryLabel n]
+
+instance ShapeLifter O O where
+  singletonDG f = gUnitOO . DBlock f . BMiddle
+  fwdEntryFact   _ f = f
+  bwdEntryFact _ _ f = f
+  ftransfer (FwdTransfer3 (_, ft, _)) n f = ft n f
+  btransfer (BwdTransfer3 (_, bt, _)) n f = bt n f
+  frewrite  (FwdRewrite3  (_, fr, _)) n f = fr n f
+  brewrite  (BwdRewrite3  (_, br, _)) n f = br n f
+  fwdEntryLabel _ = NothingC
+
+instance ShapeLifter O C where
+  singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n))
+  fwdEntryFact   _ f = f
+  bwdEntryFact _ _ f = f
+  ftransfer (FwdTransfer3 (_, _, ft)) n f = ft n f
+  btransfer (BwdTransfer3 (_, _, bt)) n f = bt n f
+  frewrite  (FwdRewrite3  (_, _, fr)) n f = fr n f
+  brewrite  (BwdRewrite3  (_, _, br)) n f = br n f
+  fwdEntryLabel _ = NothingC
+
+{-
+class ShapeLifter e x where
+  singletonDG   :: f -> n e x -> DG f n e x
+
+instance ShapeLifter C O where
+  singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil))
+
+instance ShapeLifter O O where
+  singletonDG f = gUnitOO . DBlock f . BMiddle
+
+instance ShapeLifter O C where
+  singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n))
+-}
+
+-- Fact lookup: the fact `orelse` bottom
+getFact  :: DataflowLattice f -> Label -> FactBase f -> f
+getFact lat l fb = case lookupFact l fb of Just  f -> f
+                                           Nothing -> fact_bot lat
+
+
+
+{-  Note [Respects fuel]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-}
+-- $fuel
+-- A value of type 'FwdRewrite' or 'BwdRewrite' /respects fuel/ if 
+-- any function contained within the value satisfies the following properties:
+--
+--   * When fuel is exhausted, it always returns 'Nothing'.
+--
+--   * When it returns @Just g rw@, it consumes /exactly/ one unit
+--     of fuel, and new rewrite 'rw' also respects fuel.
+--
+-- Provided that functions passed to 'mkFRewrite', 'mkFRewrite3', 
+-- 'mkBRewrite', and 'mkBRewrite3' are not aware of the fuel supply,
+-- the results respect fuel.
+--
+-- It is an /unchecked/ run-time error for the argument passed to 'wrapFR',
+-- 'wrapFR2', 'wrapBR', or 'warpBR2' to return a function that does not respect fuel.
+
+-- -----------------------------------------------------------------------------
+-- a Heap of Int
+
+-- We should really use a proper Heap here, but my attempts to make
+-- one have not succeeded in beating the simple ordered list.  Another
+-- alternative is IntSet (using deleteFindMin), but that was also
+-- slower than the ordered list in my experiments --SDM 25/1/2012
+
+type IntHeap = [Int] -- ordered
+
+insertIntHeap :: Int -> [Int] -> [Int]
+insertIntHeap x [] = [x]
+insertIntHeap x (y:ys)
+  | x < y     = x : y : ys
+  | x == y    = x : ys
+  | otherwise = y : insertIntHeap x ys
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index 04586b1029..443fa3a441 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -1,29 +1,19 @@
 {-# LANGUAGE GADTs #-}
 
--- ToDo: remove -fno-warn-warnings-deprecations
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
--- ToDo: remove -fno-warn-incomplete-patterns
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-
--- Module for building CmmAGraphs.
-
--- As the CmmAGraph is a wrapper over Graph CmmNode O x, it is different
--- from Hoopl's AGraph. The current clients expect functions with the
--- same names Hoopl uses, so this module cannot be in the same namespace
--- as Compiler.Hoopl.
-
 module MkGraph
-  ( CmmAGraph
-  , emptyAGraph, (<*>), catAGraphs, outOfLine
-  , mkLabel, mkMiddle, mkLast
-  , withFreshLabel, withUnique, lgraphOfAGraph, labelAGraph
+  ( CmmAGraph, CgStmt(..)
+  , (<*>), catAGraphs
+  , mkLabel, mkMiddle, mkLast, outOfLine
+  , lgraphOfAGraph, labelAGraph
 
   , stackStubExpr
-  , mkNop, mkAssign, mkStore, mkCall, mkCmmCall, mkSafeCall, mkUnsafeCall, mkFinalCall
-         , mkJump, mkDirectJump, mkForeignJump, mkJumpGC, mkCbranch, mkSwitch
-         , mkReturn, mkReturnSimple, mkComment, mkCallEntry
-         , mkBranch, mkCmmIfThenElse, mkCmmIfThen, mkCmmWhileDo
-         , copyInOflow, copyInSlot, copyOutOflow, copyOutSlot
+  , mkNop, mkAssign, mkStore, mkUnsafeCall, mkFinalCall, mkCallReturnsTo
+  , mkJump, mkDirectJump, mkForeignJump, mkForeignJumpExtra, mkJumpGC
+  , mkCbranch, mkSwitch
+  , mkReturn, mkReturnSimple, mkComment, mkCallEntry, mkBranch
+  , copyInOflow, copyOutOflow
+  , noExtraStack
+  , toCall, Transfer(..)
   )
 where
 
@@ -31,250 +21,232 @@ import BlockId
 import Cmm
 import CmmCallConv (assignArgumentsPos, ParamLocation(..))
 
+
 import Compiler.Hoopl hiding (Unique, (<*>), mkFirst, mkMiddle, mkLast, mkLabel, mkBranch, Shape(..))
-import qualified Compiler.Hoopl as H
-import Compiler.Hoopl.GHC (uniqueToLbl)
 import FastString
 import ForeignCall
 import Outputable
 import Prelude hiding (succ)
 import SMRep (ByteOff)
-import StaticFlags
-import Unique
 import UniqSupply
-import Util
+import OrdList
 
 #include "HsVersions.h"
 
-{-
-A 'CmmAGraph' is an abstract version of a 'Graph CmmNode O x' from module
-'Cmm'.  The difference is that the 'CmmAGraph' can be eigher open of closed at
-exit and it can supply fresh Labels and Uniques.
-
-It also supports a splicing operation <*>, which is different from the Hoopl's
-<*>, because it splices two CmmAGraphs. Specifically, it can splice Graph
-O C and Graph O x. In this case, the open beginning of the second graph is
-thrown away.  In the debug mode this sequence is checked to be empty or
-containing a branch (see note [Branch follows branch]).
-
-When an CmmAGraph open at exit is being converted to a CmmGraph, the output
-exit sequence is considered unreachable. If the graph consist of one block
-only, if it not the case and we crash. Otherwise we just throw the exit
-sequence away (and in debug mode we test that it really was unreachable).
--}
-
-{-
-Node [Branch follows branch]
-============================
-Why do we say it's ok for a Branch to follow a Branch?
-Because the standard constructor mkLabel has fall-through
-semantics. So if you do a mkLabel, you finish the current block,
-giving it a label, and start a new one that branches to that label.
-Emitting a Branch at this point is fine:
-       goto L1; L2: ...stuff...
--}
-
-data CmmGraphOC = Opened (Graph CmmNode O O)
-                | Closed (Graph CmmNode O C)
-type CmmAGraph = UniqSM CmmGraphOC     -- Graph open at entry
-
-{-
-MS: I began with
-  newtype CmmAGraph = forall x. AG (UniqSM (Graph CmmNode O x))
-but that does not work well, because we cannot take the graph
-out of the monad -- we do not know the type of what we would take
-out and pattern matching does not help, as we cannot pattern match
-on a graph inside the monad.
--}
 
-data Transfer = Call | Jump | Ret deriving Eq
+-----------------------------------------------------------------------------
+-- Building Graphs
+
+
+-- | CmmAGraph is a chunk of code consisting of:
+--
+--   * ordinary statements (assignments, stores etc.)
+--   * jumps
+--   * labels
+--   * out-of-line labelled blocks
+--
+-- The semantics is that control falls through labels and out-of-line
+-- blocks.  Everything after a jump up to the next label is by
+-- definition unreachable code, and will be discarded.
+--
+-- Two CmmAGraphs can be stuck together with <*>, with the meaning that
+-- control flows from the first to the second.
+--
+-- A 'CmmAGraph' can be turned into a 'CmmGraph' (closed at both ends)
+-- by providing a label for the entry point; see 'labelAGraph'.
+--
+type CmmAGraph = OrdList CgStmt
+
+data CgStmt
+  = CgLabel BlockId
+  | CgStmt  (CmmNode O O)
+  | CgLast  (CmmNode O C)
+  | CgFork  BlockId CmmAGraph
+
+flattenCmmAGraph :: BlockId -> CmmAGraph -> CmmGraph
+flattenCmmAGraph id stmts =
+    CmmGraph { g_entry = id,
+               g_graph = GMany NothingO body NothingO }
+  where
+  (block, blocks) = flatten (fromOL stmts)
+  entry = blockJoinHead (CmmEntry id) block
+  body = foldr addBlock emptyBody (entry:blocks)
+
+  flatten :: [CgStmt] -> (Block CmmNode O C, [Block CmmNode C C])
+  flatten [] = panic "flatten []"
+
+  -- A label at the end of a function or fork: this label must not be reachable,
+  -- but it might be referred to from another BB that also isn't reachable.
+  -- Eliminating these has to be done with a dead-code analysis.  For now,
+  -- we just make it into a well-formed block by adding a recursive jump.
+  flatten [CgLabel id]
+    = (goto_id, [blockJoinHead (CmmEntry id) goto_id] )
+    where goto_id = blockJoinTail emptyBlock (CmmBranch id)
+
+  -- A jump/branch: throw away all the code up to the next label, because
+  -- it is unreachable.  Be careful to keep forks that we find on the way.
+  flatten (CgLast stmt : stmts)
+    = case dropWhile isOrdinaryStmt stmts of
+        [] ->
+            ( sing, [] )
+        [CgLabel id] ->
+            ( sing, [blockJoin (CmmEntry id) emptyBlock (CmmBranch id)] )
+        (CgLabel id : stmts) ->
+            ( sing, blockJoinHead (CmmEntry id) block : blocks )
+            where (block,blocks) = flatten stmts
+        (CgFork fork_id stmts : ss) -> 
+            flatten (CgFork fork_id stmts : CgLast stmt : ss)
+        _ -> panic "MkGraph.flatten"
+    where
+      sing = blockJoinTail emptyBlock stmt
+
+  flatten (s:ss) = 
+        case s of
+          CgStmt stmt -> (blockCons stmt block, blocks)
+          CgLabel id  -> (blockJoinTail emptyBlock (CmmBranch id),
+                          blockJoinHead (CmmEntry id) block : blocks)
+          CgFork fork_id stmts -> 
+                (block, blockJoinHead (CmmEntry fork_id) fork_block : fork_blocks ++ blocks)
+                where (fork_block, fork_blocks) = flatten (fromOL stmts)
+          _ -> panic "MkGraph.flatten"
+    where (block,blocks) = flatten ss
+
+isOrdinaryStmt :: CgStmt -> Bool
+isOrdinaryStmt (CgStmt _) = True
+isOrdinaryStmt (CgLast _) = True
+isOrdinaryStmt _          = False
+
+
 
 ---------- AGraph manipulation
 
-emptyAGraph    :: CmmAGraph
 (<*>)          :: CmmAGraph -> CmmAGraph -> CmmAGraph
+(<*>)           = appOL
+
 catAGraphs     :: [CmmAGraph] -> CmmAGraph
+catAGraphs      = concatOL
+
+-- | created a sequence "goto id; id:" as an AGraph
+mkLabel        :: BlockId -> CmmAGraph
+mkLabel bid     = unitOL (CgLabel bid)
 
-mkLabel        :: BlockId     -> CmmAGraph  -- created a sequence "goto id; id:" as an AGraph
-mkMiddle       :: CmmNode O O -> CmmAGraph  -- creates an open AGraph from a given node
-mkLast         :: CmmNode O C -> CmmAGraph  -- created a closed AGraph from a given node
+-- | creates an open AGraph from a given node
+mkMiddle        :: CmmNode O O -> CmmAGraph
+mkMiddle middle = unitOL (CgStmt middle)
 
-withFreshLabel :: String -> (BlockId -> CmmAGraph) -> CmmAGraph
-withUnique     :: (Unique -> CmmAGraph) -> CmmAGraph
+-- | created a closed AGraph from a given node
+mkLast         :: CmmNode O C -> CmmAGraph
+mkLast last     = unitOL (CgLast last)
 
+-- | A labelled code block; should end in a last node
+outOfLine      :: BlockId -> CmmAGraph -> CmmAGraph
+outOfLine l g   = unitOL (CgFork l g)
+
+-- | allocate a fresh label for the entry point
 lgraphOfAGraph :: CmmAGraph -> UniqSM CmmGraph
-  -- ^ allocate a fresh label for the entry point
+lgraphOfAGraph g = do u <- getUniqueM
+                      return (flattenCmmAGraph (mkBlockId u) g)
+
+-- | use the given BlockId as the label of the entry point
 labelAGraph    :: BlockId -> CmmAGraph -> UniqSM CmmGraph
-  -- ^ use the given BlockId as the label of the entry point
+labelAGraph lbl ag = return (flattenCmmAGraph lbl ag)
 
 ---------- No-ops
 mkNop        :: CmmAGraph
+mkNop         = nilOL
+
 mkComment    :: FastString -> CmmAGraph
+#ifdef DEBUG
+-- SDM: generating all those comments takes time, this saved about 4% for me
+mkComment fs  = mkMiddle $ CmmComment fs
+#else
+mkComment _   = nilOL
+#endif
 
 ---------- Assignment and store
 mkAssign     :: CmmReg  -> CmmExpr -> CmmAGraph
-mkStore      :: CmmExpr -> CmmExpr -> CmmAGraph
+mkAssign l r  = mkMiddle $ CmmAssign l r
 
----------- Calls
-mkCall       :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmActual] ->
-                  UpdFrameOffset -> CmmAGraph
-mkCmmCall    :: CmmExpr ->              [CmmFormal] -> [CmmActual] ->
-                  UpdFrameOffset -> CmmAGraph
-  -- Native C-- calling convention
-mkSafeCall    :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset -> Bool -> CmmAGraph
-mkUnsafeCall  :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph
-mkFinalCall   :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-  -- Never returns; like exit() or barf()
+mkStore      :: CmmExpr -> CmmExpr -> CmmAGraph
+mkStore  l r  = mkMiddle $ CmmStore  l r
 
 ---------- Control transfer
-mkJump          ::               CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkDirectJump    ::               CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkJumpGC        ::               CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkForeignJump   :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkCbranch       :: CmmExpr -> BlockId -> BlockId          -> CmmAGraph
-mkSwitch        :: CmmExpr -> [Maybe BlockId]             -> CmmAGraph
+mkJump          :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkJump e actuals updfr_off =
+  lastWithArgs Jump Old NativeNodeCall actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
+mkDirectJump    :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkDirectJump e actuals updfr_off =
+  lastWithArgs Jump Old NativeDirectCall actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
+mkJumpGC        :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkJumpGC e actuals updfr_off =
+  lastWithArgs Jump Old GC actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
+mkForeignJump   :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset
+                -> CmmAGraph
+mkForeignJump conv e actuals updfr_off =
+  mkForeignJumpExtra conv e actuals updfr_off noExtraStack
+
+mkForeignJumpExtra :: Convention -> CmmExpr -> [CmmActual]
+                -> UpdFrameOffset -> (ByteOff, [(CmmExpr, ByteOff)])
+                -> CmmAGraph
+mkForeignJumpExtra conv e actuals updfr_off extra_stack =
+  lastWithArgsAndExtraStack Jump Old conv actuals updfr_off extra_stack $
+    toCall e Nothing updfr_off 0
+
+mkCbranch       :: CmmExpr -> BlockId -> BlockId -> CmmAGraph
+mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot)
+
+mkSwitch        :: CmmExpr -> [Maybe BlockId] -> CmmAGraph
+mkSwitch e tbl   = mkLast $ CmmSwitch e tbl
+
 mkReturn        :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkReturn e actuals updfr_off =
+  lastWithArgs Ret  Old NativeReturn actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
 mkReturnSimple  :: [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkReturnSimple actuals updfr_off =
+  mkReturn e actuals updfr_off
+  where e = CmmLoad (CmmStackSlot Old updfr_off) gcWord
 
 mkBranch        :: BlockId -> CmmAGraph
-mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> CmmAGraph
-mkCmmIfThen     :: CmmExpr -> CmmAGraph -> CmmAGraph
-mkCmmWhileDo    :: CmmExpr -> CmmAGraph -> CmmAGraph
-
-outOfLine       :: CmmAGraph -> CmmAGraph
--- ^ The argument is an CmmAGraph that must have an
--- empty entry sequence and be closed at the end.
--- The result is a new CmmAGraph that is open at the
--- end and goes directly from entry to exit, with the
--- original graph sitting to the side out-of-line.
---
--- Example:  mkMiddle (x = 3)
---           <*> outOfLine (mkLabel L <*> ...stuff...)
---           <*> mkMiddle (y = x)
--- Control will flow directly from x=3 to y=x;
--- the block starting with L is "on the side".
---
--- N.B. algebraically forall g g' : g <*> outOfLine g' == outOfLine g' <*> g
+mkBranch bid     = mkLast (CmmBranch bid)
+
+mkFinalCall   :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset
+              -> CmmAGraph
+mkFinalCall f _ actuals updfr_off =
+  lastWithArgs Call Old NativeDirectCall actuals updfr_off $
+    toCall f Nothing updfr_off 0
+
+mkCallReturnsTo :: CmmExpr -> Convention -> [CmmActual]
+                -> BlockId
+                -> ByteOff
+                -> UpdFrameOffset
+                -> (ByteOff, [(CmmExpr,ByteOff)])
+                -> CmmAGraph
+mkCallReturnsTo f callConv actuals ret_lbl ret_off updfr_off extra_stack = do
+  lastWithArgsAndExtraStack Call (Young ret_lbl) callConv actuals
+     updfr_off extra_stack $
+       toCall f (Just ret_lbl) updfr_off ret_off
+
+mkUnsafeCall  :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph
+mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as
+
 
 --------------------------------------------------------------------------
 
--- ================ IMPLEMENTATION ================--
-
---------------------------------------------------
--- Raw CmmAGraph handling
-
-emptyAGraph = return $ Opened emptyGraph
-ag <*> ah = do g <- ag
-               h <- ah
-               return (case (g, h) of
-                 (Opened g, Opened h) -> Opened $ g H.<*> h
-                 (Opened g, Closed h) -> Closed $ g H.<*> h
-                 (Closed g, Opened GNil) -> Closed g
-                 (Closed g, Opened (GUnit e)) -> note_unreachable e $ Closed g
-                 (Closed g, Opened (GMany (JustO e) b x)) -> note_unreachable e $ Opened $ g H.|*><*| GMany NothingO b x
-                 (Closed g, Closed (GMany (JustO e) b x)) -> note_unreachable e $ Closed $ g H.|*><*| GMany NothingO b x
-                 :: CmmGraphOC)
-catAGraphs = foldl (<*>) emptyAGraph
-
-outOfLine ag = withFreshLabel "outOfLine" $ \l ->
-               do g <- ag
-                  return (case g of
-                    Closed (GMany (JustO e) b _) -> note_unreachable e $ Opened $
-                                                      GMany (JustO $ BLast $ CmmBranch l) b (JustO $ BFirst $ CmmEntry l)
-                    _                            -> panic "outOfLine"
-                    :: CmmGraphOC)
-
-note_unreachable :: Block CmmNode O x -> a -> a
-note_unreachable block graph =
-  ASSERT (block_is_empty_or_label)  -- Note [Branch follows branch]
-  graph
-  where block_is_empty_or_label :: Bool
-        block_is_empty_or_label = case blockToNodeList block of
-                                    (NothingC, [], NothingC)            -> True
-                                    (NothingC, [], JustC (CmmBranch _)) -> True
-                                    _                                   -> False
-
-mkLabel bid = return $ Opened $ H.mkLast (CmmBranch bid) |*><*| H.mkFirst (CmmEntry bid)
-mkMiddle middle = return $ Opened $ H.mkMiddle middle
-mkLast last = return $ Closed $ H.mkLast last
-
-withUnique f = getUniqueM >>= f
-withFreshLabel _name f = getUniqueM >>= f . uniqueToLbl . intToUnique . getKey
 
-lgraphOfAGraph g = do u <- getUniqueM
-                      labelAGraph (mkBlockId u) g
-
-labelAGraph lbl ag = do g <- ag
-                        return $ CmmGraph {g_entry=lbl, g_graph=H.mkFirst (CmmEntry lbl) H.<*> closed g}
-  where closed :: CmmGraphOC -> Graph CmmNode O C
-        closed (Closed g) = g
-        closed (Opened g@(GMany entry body (JustO exit))) =
-          ASSERT (entryLabel exit `notElem` map entryLabel (postorder_dfs g))
-          GMany entry body NothingO
-        closed (Opened _) = panic "labelAGraph"
-
---------------------------------------------------
--- CmmAGraph constructions
-
-mkNop                     = emptyAGraph
-mkComment fs              = mkMiddle $ CmmComment fs
-mkStore  l r              = mkMiddle $ CmmStore  l r
-
--- NEED A COMPILER-DEBUGGING FLAG HERE
--- Sanity check: any value assigned to a pointer must be non-zero.
--- If it's 0, cause a crash immediately.
-mkAssign l r = if opt_StubDeadValues then assign l r <*> check l else assign l r
-  where assign l r = mkMiddle (CmmAssign l r)
-        check (CmmGlobal _) = mkNop
-        check l@(CmmLocal reg) = -- if a ptr arg is NULL, cause a crash!
-          if isGcPtrType ty then
-            mkCmmIfThen (CmmMachOp (MO_Eq w) [r, stackStubExpr w])
-                        (assign l (CmmLoad (CmmLit (CmmInt 0 w)) ty))
-          else mkNop
-            where ty = localRegType reg
-                  w  = typeWidth ty
-                  r  = CmmReg l
 
 
 -- Why are we inserting extra blocks that simply branch to the successors?
 -- Because in addition to the branch instruction, @mkBranch@ will insert
 -- a necessary adjustment to the stack pointer.
-mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot)
-mkSwitch e tbl            = mkLast $ CmmSwitch e tbl
-
-mkSafeCall   t fs as upd i = withFreshLabel "safe call" $ body
-  where
-    body k =
-     (    mkStore (CmmStackSlot (CallArea (Young k)) (widthInBytes wordWidth))
-                  (CmmLit (CmmBlock k))
-      <*> mkLast (CmmForeignCall {tgt=t, res=fs, args=as, succ=k, updfr=upd, intrbl=i})
-      <*> mkLabel k)
-mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as
 
-mkBranch bid = mkLast (CmmBranch bid)
-
-mkCmmIfThenElse e tbranch fbranch =
-  withFreshLabel "end of if"     $ \endif ->
-  withFreshLabel "start of then" $ \tid ->
-  withFreshLabel "start of else" $ \fid ->
-    mkCbranch e tid fid <*>
-    mkLabel tid <*> tbranch <*> mkBranch endif <*>
-    mkLabel fid <*> fbranch <*> mkLabel endif
-
-mkCmmIfThen e tbranch
-  = withFreshLabel "end of if"     $ \endif ->
-    withFreshLabel "start of then" $ \tid ->
-      mkCbranch e tid endif <*>
-      mkLabel tid <*> tbranch <*> mkLabel endif
-
-mkCmmWhileDo e body =
-  withFreshLabel "loop test" $ \test ->
-  withFreshLabel "loop head" $ \head ->
-  withFreshLabel "end while" $ \endwhile ->
-    -- Forrest Baskett's while-loop layout
-    mkBranch test <*> mkLabel head <*> body
-                  <*> mkLabel test <*> mkCbranch e head endwhile
-                  <*> mkLabel endwhile
 
 -- For debugging purposes, we can stub out dead stack slots:
 stackStubExpr :: Width -> CmmExpr
@@ -286,12 +258,9 @@ stackStubExpr w = CmmLit (CmmInt 0 w)
 -- Therefore, for copying arguments and results, we provide different
 -- functions to pass the arguments in an overflow area and to pass them in spill slots.
 copyInOflow  :: Convention -> Area -> [CmmFormal] -> (Int, CmmAGraph)
-copyInSlot   :: Convention -> [CmmFormal] -> [CmmNode O O]
-copyOutSlot  :: Convention -> [LocalReg] -> [CmmNode O O]
 
 copyInOflow conv area formals = (offset, catAGraphs $ map mkMiddle nodes)
   where (offset, nodes) = copyIn oneCopyOflowI conv area formals
-copyInSlot c f = snd $ copyIn oneCopySlotI c (panic "no area for copying to slots") f
 
 type SlotCopier = Area -> (LocalReg, ByteOff) -> (ByteOff, [CmmNode O O]) ->
                           (ByteOff, [CmmNode O O])
@@ -312,104 +281,91 @@ copyIn oflow conv area formals =
                 adjust rst x@(_, RegisterParam _) = x : rst
 
 -- Copy-in one arg, using overflow space if needed.
-oneCopyOflowI, oneCopySlotI :: SlotCopier
+oneCopyOflowI :: SlotCopier
 oneCopyOflowI area (reg, off) (n, ms) =
   (max n off, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty) : ms)
   where ty = localRegType reg
 
--- Copy-in one arg, using spill slots if needed -- used for calling conventions at
--- a procpoint that is not a return point. The offset is irrelevant here...
-oneCopySlotI _ (reg, _) (n, ms) =
-  (n, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg) w) ty) : ms)
-  where ty = localRegType reg
-        w  = widthInBytes (typeWidth ty)
-
-
 -- Factoring out the common parts of the copyout functions yielded something
 -- more complicated:
 
-copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual] -> UpdFrameOffset ->
-                              (Int, CmmAGraph)
+data Transfer = Call | Jump | Ret deriving Eq
+
+copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual]
+             -> UpdFrameOffset
+             -> (ByteOff, [(CmmExpr,ByteOff)]) -- extra stack stuff
+             -> (Int, [GlobalReg], CmmAGraph)
+
 -- Generate code to move the actual parameters into the locations
--- required by the calling convention.  This includes a store for the return address.
+-- required by the calling convention.  This includes a store for the
+-- return address.
 --
--- The argument layout function ignores the pointer to the info table, so we slot that
--- in here. When copying-out to a young area, we set the info table for return
--- and adjust the offsets of the other parameters.
--- If this is a call instruction, we adjust the offsets of the other parameters.
-copyOutOflow conv transfer area@(CallArea a) actuals updfr_off
-  = foldr co (init_offset, emptyAGraph) args'
+-- The argument layout function ignores the pointer to the info table,
+-- so we slot that in here. When copying-out to a young area, we set
+-- the info table for return and adjust the offsets of the other
+-- parameters.  If this is a call instruction, we adjust the offsets
+-- of the other parameters.
+copyOutOflow conv transfer area actuals updfr_off
+  (extra_stack_off, extra_stack_stuff)
+  = foldr co (init_offset, [], mkNop) (args' ++ stack_params)
   where 
-    co (v, RegisterParam r) (n, ms) = (n, mkAssign (CmmGlobal r) v <*> ms)
-    co (v, StackParam off)  (n, ms) = (max n off, mkStore (CmmStackSlot area off) v <*> ms)
+    co (v, RegisterParam r) (n, rs, ms)
+       = (n, r:rs, mkAssign (CmmGlobal r) v <*> ms)
+    co (v, StackParam off)  (n, rs, ms)
+       = (max n off, rs, mkStore (CmmStackSlot area off) v <*> ms)
+
+    stack_params = [ (e, StackParam (off + init_offset))
+                   | (e,off) <- extra_stack_stuff ]
 
     (setRA, init_offset) =
-      case a of Young id -> id `seq` -- Generate a store instruction for
-                                     -- the return address if making a call
+      case area of
+            Young id -> id `seq` -- Generate a store instruction for
+                                 -- the return address if making a call
                   if transfer == Call then
                     ([(CmmLit (CmmBlock id), StackParam init_offset)],
                      widthInBytes wordWidth)
                   else ([], 0)
-                Old -> ([], updfr_off)
+            Old -> ([], updfr_off)
+
+    arg_offset = init_offset + extra_stack_off
 
     args :: [(CmmExpr, ParamLocation)]   -- The argument and where to put it
     args = assignArgumentsPos conv cmmExprType actuals
 
     args' = foldl adjust setRA args
-      where adjust rst   (v, StackParam off)  = (v, StackParam (off + init_offset)) : rst
+      where adjust rst   (v, StackParam off)  = (v, StackParam (off + arg_offset)) : rst
             adjust rst x@(_, RegisterParam _) = x : rst
 
-copyOutOflow _ _ (RegSlot _) _ _ = panic "cannot copy arguments into a register slot"
 
--- Args passed only in registers and stack slots; no overflow space.
--- No return address may apply!
-copyOutSlot conv actuals = foldr co [] args
-  where co (v, RegisterParam r) ms = CmmAssign (CmmGlobal r) (toExp v) : ms
-        co (v, StackParam off)  ms = CmmStore  (CmmStackSlot (RegSlot v) off) (toExp v) : ms
-        toExp r = CmmReg (CmmLocal r)
-        args = assignArgumentsPos conv localRegType actuals
 
 mkCallEntry :: Convention -> [CmmFormal] -> (Int, CmmAGraph)
-mkCallEntry conv formals = copyInOflow conv (CallArea Old) formals
+mkCallEntry conv formals = copyInOflow conv Old formals
 
-lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual] -> UpdFrameOffset ->
-                (ByteOff -> CmmAGraph) -> CmmAGraph
+lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual]
+             -> UpdFrameOffset
+             -> (ByteOff -> [GlobalReg] -> CmmAGraph)
+             -> CmmAGraph
 lastWithArgs transfer area conv actuals updfr_off last =
-  let (outArgs, copies) = copyOutOflow conv transfer area actuals updfr_off in
-  copies <*> last outArgs
-
--- The area created for the jump and return arguments is the same area as the
--- procedure entry.
-old :: Area
-old = CallArea Old
-toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff -> ByteOff -> CmmAGraph
-toCall e cont updfr_off res_space arg_space =
-  mkLast $ CmmCall e cont arg_space res_space updfr_off
-mkJump e actuals updfr_off =
-  lastWithArgs Jump old NativeNodeCall actuals updfr_off $ toCall e Nothing updfr_off 0
-mkDirectJump e actuals updfr_off =
-  lastWithArgs Jump old NativeDirectCall actuals updfr_off $ toCall e Nothing updfr_off 0
-mkJumpGC e actuals updfr_off =
-  lastWithArgs Jump old GC actuals updfr_off $ toCall e Nothing updfr_off 0
-mkForeignJump conv e actuals updfr_off =
-  lastWithArgs Jump old conv actuals updfr_off $ toCall e Nothing updfr_off 0
-mkReturn e actuals updfr_off =
-  lastWithArgs Ret  old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0
-    -- where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord
-mkReturnSimple actuals updfr_off =
-  lastWithArgs Ret  old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0
-    where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord
-
-mkFinalCall f _ actuals updfr_off =
-  lastWithArgs Call old NativeDirectCall actuals updfr_off $ toCall f Nothing updfr_off 0
-
-mkCmmCall f results actuals = mkCall f (NativeDirectCall, NativeReturn) results actuals
-
--- I'm dropping the SRT, but that should be okay: we plan to reconstruct it later.
-mkCall f (callConv, retConv) results actuals updfr_off =
-  withFreshLabel "call successor" $ \k ->
-    let area = CallArea $ Young k
-        (off, copyin) = copyInOflow retConv area results
-        copyout = lastWithArgs Call area callConv actuals updfr_off 
-                               (toCall f (Just k) updfr_off off)
-    in (copyout <*> mkLabel k <*> copyin)
+  lastWithArgsAndExtraStack transfer area conv actuals
+                            updfr_off noExtraStack last
+
+lastWithArgsAndExtraStack :: Transfer -> Area -> Convention -> [CmmActual]
+             -> UpdFrameOffset -> (ByteOff, [(CmmExpr,ByteOff)])
+             -> (ByteOff -> [GlobalReg] -> CmmAGraph)
+             -> CmmAGraph
+lastWithArgsAndExtraStack transfer area conv actuals updfr_off
+                          extra_stack last =
+  copies <*> last outArgs regs
+ where
+  (outArgs, regs, copies) = copyOutOflow conv transfer area actuals
+                               updfr_off extra_stack
+
+
+noExtraStack :: (ByteOff, [(CmmExpr,ByteOff)])
+noExtraStack = (0,[])
+
+toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff
+       -> ByteOff -> [GlobalReg]
+       -> CmmAGraph
+toCall e cont updfr_off res_space arg_space regs =
+  mkLast $ CmmCall e cont regs arg_space res_space updfr_off
diff --git a/compiler/cmm/OldCmm.hs b/compiler/cmm/OldCmm.hs
index fc4706c8c4..aa83afbf8d 100644
--- a/compiler/cmm/OldCmm.hs
+++ b/compiler/cmm/OldCmm.hs
@@ -9,9 +9,7 @@
 module OldCmm (
         CmmGroup, GenCmmGroup, RawCmmGroup, CmmDecl, RawCmmDecl,
         ListGraph(..),
-
-        CmmInfo(..), CmmInfoTable(..), ClosureTypeInfo(..), UpdateFrame(..),
-
+        CmmInfoTable(..), ClosureTypeInfo(..),
         CmmStatic(..), CmmStatics(..), CmmFormal, CmmActual,
 
         cmmMapGraph, cmmTopMapGraph,
@@ -49,24 +47,6 @@ import ForeignCall
 -- with assembly-language labels.
 
 -----------------------------------------------------------------------------
---     Info Tables
------------------------------------------------------------------------------
-
-data CmmInfo
-  = CmmInfo
-        (Maybe BlockId)     -- GC target. Nothing <=> CPS won't do stack check
-                            -- JD: NOT USED BY NEW CODE GEN
-        (Maybe UpdateFrame) -- Update frame
-        CmmInfoTable        -- Info table
-
--- | A frame that is to be pushed before entry to the function.
--- Used to handle 'update' frames.
-data UpdateFrame
-  = UpdateFrame
-        CmmExpr    -- Frame header.  Behaves like the target of a 'jump'.
-        [CmmExpr]  -- Frame remainder.  Behaves like the arguments of a 'jump'.
-
------------------------------------------------------------------------------
 --  Cmm, CmmDecl, CmmBasicBlock
 -----------------------------------------------------------------------------
 
@@ -85,8 +65,8 @@ data UpdateFrame
 newtype ListGraph i = ListGraph [GenBasicBlock i]
 
 -- | Cmm with the info table as a data type
-type CmmGroup = GenCmmGroup CmmStatics CmmInfo (ListGraph CmmStmt)
-type CmmDecl = GenCmmDecl CmmStatics CmmInfo (ListGraph CmmStmt)
+type CmmGroup = GenCmmGroup CmmStatics CmmInfoTable (ListGraph CmmStmt)
+type CmmDecl = GenCmmDecl CmmStatics CmmInfoTable (ListGraph CmmStmt)
 
 -- | Cmm with the info tables converted to a list of 'CmmStatic' along with the info
 -- table label. If we are building without tables-next-to-code there will be no statics
@@ -225,16 +205,9 @@ instance UserOfLocalRegs CmmCallTarget where
     foldRegsUsed f set (CmmCallee e _)    = foldRegsUsed f set e
     foldRegsUsed f set (CmmPrim _ mStmts) = foldRegsUsed f set mStmts
 
-instance UserOfSlots CmmCallTarget where
-    foldSlotsUsed f set (CmmCallee e _) = foldSlotsUsed f set e
-    foldSlotsUsed _ set (CmmPrim {})    = set
-
 instance UserOfLocalRegs a => UserOfLocalRegs (CmmHinted a) where
     foldRegsUsed f set a = foldRegsUsed f set (hintlessCmm a)
 
-instance UserOfSlots a => UserOfSlots (CmmHinted a) where
-    foldSlotsUsed f set a = foldSlotsUsed f set (hintlessCmm a)
-
 instance DefinerOfLocalRegs a => DefinerOfLocalRegs (CmmHinted a) where
     foldRegsDefd f set a = foldRegsDefd f set (hintlessCmm a)
 
diff --git a/compiler/cmm/OldCmmLint.hs b/compiler/cmm/OldCmmLint.hs
new file mode 100644
index 0000000000..72e40ce4f8
--- /dev/null
+++ b/compiler/cmm/OldCmmLint.hs
@@ -0,0 +1,209 @@
+-----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 2004-2006
+--
+-- CmmLint: checking the correctness of Cmm statements and expressions
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS -fno-warn-tabs #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and
+-- detab the module (please do the detabbing in a separate patch). See
+--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
+-- for details
+
+module OldCmmLint (
+  cmmLint, cmmLintTop
+  ) where
+
+import BlockId
+import OldCmm
+import CLabel
+import Outputable
+import OldPprCmm()
+import Constants
+import FastString
+import Platform
+
+import Data.Maybe
+
+-- -----------------------------------------------------------------------------
+-- Exported entry points:
+
+cmmLint :: (Outputable d, Outputable h)
+        => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc
+cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops
+
+cmmLintTop :: (Outputable d, Outputable h)
+           => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc
+cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top
+
+runCmmLint :: Outputable a
+           => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc
+runCmmLint _ l p =
+   case unCL (l p) of
+   Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
+                           nest 2 err,
+                           ptext $ sLit ("Program was:"),
+                           nest 2 (ppr p)])
+   Right _  -> Nothing
+
+lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint ()
+lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks))
+  = addLintInfo (text "in proc " <> pprCLabel platform lbl) $
+        let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks
+	in  mapM_ (lintCmmBlock platform labels) blocks
+
+lintCmmDecl _ (CmmData {})
+  = return ()
+
+lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint ()
+lintCmmBlock platform labels (BasicBlock id stmts)
+  = addLintInfo (text "in basic block " <> ppr id) $
+	mapM_ (lintCmmStmt platform labels) stmts
+
+-- -----------------------------------------------------------------------------
+-- lintCmmExpr
+
+-- Checks whether a CmmExpr is "type-correct", and check for obvious-looking
+-- byte/word mismatches.
+
+lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType
+lintCmmExpr platform (CmmLoad expr rep) = do
+  _ <- lintCmmExpr platform expr
+  -- Disabled, if we have the inlining phase before the lint phase,
+  -- we can have funny offsets due to pointer tagging. -- EZY
+  -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
+  --   cmmCheckWordAddress expr
+  return rep
+lintCmmExpr platform expr@(CmmMachOp op args) = do
+  tys <- mapM (lintCmmExpr platform) args
+  if map (typeWidth . cmmExprType) args == machOpArgReps op
+  	then cmmCheckMachOp op args tys
+	else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
+lintCmmExpr platform (CmmRegOff reg offset)
+  = lintCmmExpr platform (CmmMachOp (MO_Add rep)
+		[CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
+  where rep = typeWidth (cmmRegType reg)
+lintCmmExpr _ expr =
+  return (cmmExprType expr)
+
+-- Check for some common byte/word mismatches (eg. Sp + 1)
+cmmCheckMachOp   :: MachOp -> [CmmExpr] -> [CmmType] -> CmmLint CmmType
+cmmCheckMachOp op [lit@(CmmLit (CmmInt { })), reg@(CmmReg _)] tys
+  = cmmCheckMachOp op [reg, lit] tys
+cmmCheckMachOp op _ tys
+  = return (machOpResultType op tys)
+
+isOffsetOp :: MachOp -> Bool
+isOffsetOp (MO_Add _) = True
+isOffsetOp (MO_Sub _) = True
+isOffsetOp _ = False
+
+-- This expression should be an address from which a word can be loaded:
+-- check for funny-looking sub-word offsets.
+_cmmCheckWordAddress :: CmmExpr -> CmmLint ()
+_cmmCheckWordAddress e@(CmmMachOp op [arg, CmmLit (CmmInt i _)])
+  | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
+  = cmmLintDubiousWordOffset e
+_cmmCheckWordAddress e@(CmmMachOp op [CmmLit (CmmInt i _), arg])
+  | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
+  = cmmLintDubiousWordOffset e
+_cmmCheckWordAddress _
+  = return ()
+
+-- No warnings for unaligned arithmetic with the node register,
+-- which is used to extract fields from tagged constructor closures.
+notNodeReg :: CmmExpr -> Bool
+notNodeReg (CmmReg reg) | reg == nodeReg = False
+notNodeReg _                             = True
+
+lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint ()
+lintCmmStmt platform labels = lint
+    where lint (CmmNop) = return ()
+          lint (CmmComment {}) = return ()
+          lint stmt@(CmmAssign reg expr) = do
+            erep <- lintCmmExpr platform expr
+	    let reg_ty = cmmRegType reg
+            if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
+                then return ()
+                else cmmLintAssignErr stmt erep reg_ty
+          lint (CmmStore l r) = do
+            _ <- lintCmmExpr platform l
+            _ <- lintCmmExpr platform r
+            return ()
+          lint (CmmCall target _res args _) =
+              do lintTarget platform labels target
+                 mapM_ (lintCmmExpr platform . hintlessCmm) args
+          lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e
+          lint (CmmSwitch e branches) = do
+            mapM_ checkTarget $ catMaybes branches
+            erep <- lintCmmExpr platform e
+            if (erep `cmmEqType_ignoring_ptrhood` bWord)
+              then return ()
+              else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <>
+                               text " :: " <> ppr erep)
+          lint (CmmJump e _) = lintCmmExpr platform e >> return ()
+          lint (CmmReturn) = return ()
+          lint (CmmBranch id)    = checkTarget id
+          checkTarget id = if setMember id labels then return ()
+                           else cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
+
+lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint ()
+lintTarget platform _      (CmmCallee e _) = do _ <- lintCmmExpr platform e
+                                                return ()
+lintTarget _        _      (CmmPrim _ Nothing) = return ()
+lintTarget platform labels (CmmPrim _ (Just stmts))
+    = mapM_ (lintCmmStmt platform labels) stmts
+
+
+checkCond :: CmmExpr -> CmmLint ()
+checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return ()
+checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values
+checkCond expr
+    = cmmLintErr (hang (text "expression is not a conditional:") 2
+                       (ppr expr))
+
+-- -----------------------------------------------------------------------------
+-- CmmLint monad
+
+-- just a basic error monad:
+
+newtype CmmLint a = CmmLint { unCL :: Either SDoc a }
+
+instance Monad CmmLint where
+  CmmLint m >>= k = CmmLint $ case m of 
+				Left e -> Left e
+				Right a -> unCL (k a)
+  return a = CmmLint (Right a)
+
+cmmLintErr :: SDoc -> CmmLint a
+cmmLintErr msg = CmmLint (Left msg)
+
+addLintInfo :: SDoc -> CmmLint a -> CmmLint a
+addLintInfo info thing = CmmLint $ 
+   case unCL thing of
+	Left err -> Left (hang info 2 err)
+	Right a  -> Right a
+
+cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a
+cmmLintMachOpErr expr argsRep opExpectsRep
+     = cmmLintErr (text "in MachOp application: " $$ 
+					nest 2 (ppr expr) $$
+				        (text "op is expecting: " <+> ppr opExpectsRep) $$
+					(text "arguments provide: " <+> ppr argsRep))
+
+cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a
+cmmLintAssignErr stmt e_ty r_ty
+  = cmmLintErr (text "in assignment: " $$ 
+		nest 2 (vcat [ppr stmt, 
+			      text "Reg ty:" <+> ppr r_ty,
+			      text "Rhs ty:" <+> ppr e_ty]))
+			 
+					
+
+cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a
+cmmLintDubiousWordOffset expr
+   = cmmLintErr (text "offset is not a multiple of words: " $$
+			nest 2 (ppr expr))
diff --git a/compiler/cmm/OldPprCmm.hs b/compiler/cmm/OldPprCmm.hs
index d6a12221fb..9990fd26a4 100644
--- a/compiler/cmm/OldPprCmm.hs
+++ b/compiler/cmm/OldPprCmm.hs
@@ -63,10 +63,6 @@ instance Outputable instr => Outputable (GenBasicBlock instr) where
 instance Outputable CmmStmt where
     ppr s = pprStmt s
 
-instance Outputable CmmInfo where
-    ppr i = pprInfo i
-
-
 -- --------------------------------------------------------------------------
 instance Outputable CmmSafety where
   ppr CmmUnsafe = ptext (sLit "_unsafe_call_")
@@ -74,21 +70,6 @@ instance Outputable CmmSafety where
   ppr (CmmSafe srt) = ppr srt
 
 -- --------------------------------------------------------------------------
--- Info tables. The current pretty printer needs refinement
--- but will work for now.
---
--- For ideas on how to refine it, they used to be printed in the
--- style of C--'s 'stackdata' declaration, just inside the proc body,
--- and were labelled with the procedure name ++ "_info".
-pprInfo :: CmmInfo -> SDoc
-pprInfo (CmmInfo _gc_target update_frame info_table) =
-    vcat [{-ptext (sLit "gc_target: ") <>
-                maybe (ptext (sLit "<none>")) ppr gc_target,-}
-          ptext (sLit "update_frame: ") <>
-                maybe (ptext (sLit "<none>")) pprUpdateFrame update_frame,
-          ppr info_table]
-
--- --------------------------------------------------------------------------
 -- Basic blocks look like assembly blocks.
 --      lbl: stmt ; stmt ; ..
 pprBBlock :: Outputable stmt => GenBasicBlock stmt -> SDoc
@@ -156,18 +137,6 @@ pprStmt stmt = case stmt of
 instance (Outputable a) => Outputable (CmmHinted a) where
   ppr (CmmHinted a k) = ppr (a, k)
 
-pprUpdateFrame :: UpdateFrame -> SDoc
-pprUpdateFrame (UpdateFrame expr args) =
-    hcat [ ptext (sLit "jump")
-         , space
-         , if isTrivialCmmExpr expr
-                then pprExpr expr
-                else case expr of
-                    CmmLoad (CmmReg _) _ -> pprExpr expr
-                    _ -> parens (pprExpr expr)
-         , space
-         , parens  ( commafy $ map ppr args ) ]
-
 -- --------------------------------------------------------------------------
 -- goto local label. [1], section 6.6
 --
diff --git a/compiler/cmm/OptimizationFuel.hs b/compiler/cmm/OptimizationFuel.hs
deleted file mode 100644
index a85b11bcc6..0000000000
--- a/compiler/cmm/OptimizationFuel.hs
+++ /dev/null
@@ -1,142 +0,0 @@
-{-# LANGUAGE TypeFamilies #-}
--- | Optimisation fuel is used to control the amount of work the optimiser does.
---
--- Every optimisation step consumes a certain amount of fuel and stops when
--- it runs out of fuel.  This can be used e.g. to debug optimiser bugs: Run
--- the optimiser with varying amount of fuel to find out the exact number of
--- steps where a bug is introduced in the output.
-module OptimizationFuel
-    ( OptimizationFuel, amountOfFuel, tankFilledTo, unlimitedFuel, anyFuelLeft, oneLessFuel
-    , OptFuelState, initOptFuelState
-    , FuelConsumer, FuelUsingMonad, FuelState
-    , fuelGet, fuelSet, lastFuelPass, setFuelPass
-    , fuelExhausted, fuelDec1, tryWithFuel
-    , runFuelIO, runInfiniteFuelIO, fuelConsumingPass
-    , FuelUniqSM
-    , liftUniq
-    )
-where
-
-import Data.IORef
-import Control.Monad
-import StaticFlags (opt_Fuel)
-import UniqSupply
-import Panic
-import Util
-
-import Compiler.Hoopl
-import Compiler.Hoopl.GHC (getFuel, setFuel)
-
-#include "HsVersions.h"
-
-
--- We limit the number of transactions executed using a record of flags
--- stored in an HscEnv. The flags store the name of the last optimization
--- pass and the amount of optimization fuel remaining.
-data OptFuelState =
-  OptFuelState { pass_ref :: IORef String
-               , fuel_ref :: IORef OptimizationFuel
-               }
-initOptFuelState :: IO OptFuelState
-initOptFuelState =
-  do pass_ref' <- newIORef "unoptimized program"
-     fuel_ref' <- newIORef (tankFilledTo opt_Fuel)
-     return OptFuelState {pass_ref = pass_ref', fuel_ref = fuel_ref'}
-
-type FuelConsumer a = OptimizationFuel -> (a, OptimizationFuel)
-
-tankFilledTo :: Int -> OptimizationFuel
-amountOfFuel :: OptimizationFuel -> Int
-
-anyFuelLeft :: OptimizationFuel -> Bool
-oneLessFuel :: OptimizationFuel -> OptimizationFuel
-unlimitedFuel :: OptimizationFuel
-
-newtype OptimizationFuel = OptimizationFuel Int
-  deriving Show
-
-tankFilledTo = OptimizationFuel
-amountOfFuel (OptimizationFuel f) = f
-
-anyFuelLeft (OptimizationFuel f) = f > 0
-oneLessFuel (OptimizationFuel f) = ASSERT (f > 0) (OptimizationFuel (f - 1))
-unlimitedFuel = OptimizationFuel infiniteFuel
-
-data FuelState = FuelState { fs_fuel :: OptimizationFuel, fs_lastpass :: String }
-newtype FuelUniqSM a = FUSM { unFUSM :: FuelState -> UniqSM (a, FuelState) }
-
-fuelConsumingPass :: String -> FuelConsumer a -> FuelUniqSM a
-fuelConsumingPass name f = do setFuelPass name
-                              fuel <- fuelGet
-                              let (a, fuel') = f fuel
-                              fuelSet fuel'
-                              return a
-
-runFuelIO :: OptFuelState -> FuelUniqSM a -> IO a
-runFuelIO fs (FUSM f) =
-    do pass <- readIORef (pass_ref fs)
-       fuel <- readIORef (fuel_ref fs)
-       u    <- mkSplitUniqSupply 'u'
-       let (a, FuelState fuel' pass') = initUs_ u $ f (FuelState fuel pass)
-       writeIORef (pass_ref fs) pass'
-       writeIORef (fuel_ref fs) fuel'
-       return a
-
--- ToDo: Do we need the pass_ref when we are doing infinite fueld
--- transformations?
-runInfiniteFuelIO :: OptFuelState -> FuelUniqSM a -> IO a
-runInfiniteFuelIO fs (FUSM f) =
-    do pass <- readIORef (pass_ref fs)
-       u <- mkSplitUniqSupply 'u'
-       let (a, FuelState _ pass') = initUs_ u $ f (FuelState unlimitedFuel pass)
-       writeIORef (pass_ref fs) pass'
-       return a
-
-instance Monad FuelUniqSM where
-  FUSM f >>= k = FUSM (\s -> f s >>= \(a, s') -> unFUSM (k a) s')
-  return a     = FUSM (\s -> return (a, s))
-
-instance MonadUnique FuelUniqSM where
-    getUniqueSupplyM = liftUniq getUniqueSupplyM
-    getUniqueM       = liftUniq getUniqueM
-    getUniquesM      = liftUniq getUniquesM
-
-liftUniq :: UniqSM x -> FuelUniqSM x
-liftUniq x = FUSM (\s -> x >>= (\u -> return (u, s)))
-
-class Monad m => FuelUsingMonad m where
-  fuelGet      :: m OptimizationFuel
-  fuelSet      :: OptimizationFuel -> m ()
-  lastFuelPass :: m String
-  setFuelPass  :: String -> m ()
-
-fuelExhausted :: FuelUsingMonad m => m Bool
-fuelExhausted = fuelGet >>= return . anyFuelLeft
-
-fuelDec1 :: FuelUsingMonad m => m ()
-fuelDec1 = fuelGet >>= fuelSet . oneLessFuel
-
-tryWithFuel :: FuelUsingMonad m => a -> m (Maybe a)
-tryWithFuel r = do f <- fuelGet
-                   if anyFuelLeft f then fuelSet (oneLessFuel f) >> return (Just r)
-                                    else return Nothing
-
-instance FuelUsingMonad FuelUniqSM where
-  fuelGet          = extract fs_fuel
-  lastFuelPass     = extract fs_lastpass
-  fuelSet fuel     = FUSM (\s -> return ((), s { fs_fuel     = fuel }))
-  setFuelPass pass = FUSM (\s -> return ((), s { fs_lastpass = pass }))
-
-extract :: (FuelState -> a) -> FuelUniqSM a
-extract f = FUSM (\s -> return (f s, s))
-
-instance FuelMonad FuelUniqSM where
-  getFuel = liftM amountOfFuel fuelGet
-  setFuel = fuelSet . tankFilledTo
-
--- Don't bother to checkpoint the unique supply; it doesn't matter
-instance CheckpointMonad FuelUniqSM where
-    type Checkpoint FuelUniqSM = FuelState
-    checkpoint = FUSM $ \fuel -> return (fuel, fuel) 
-    restart fuel = FUSM $ \_ -> return ((), fuel)
-
diff --git a/compiler/cmm/PprCmm.hs b/compiler/cmm/PprCmm.hs
index 183708c08e..9717eea179 100644
--- a/compiler/cmm/PprCmm.hs
+++ b/compiler/cmm/PprCmm.hs
@@ -146,8 +146,6 @@ pprConvention  Slow                 = text "<slow-convention>"
 pprConvention  GC                   = text "<gc-convention>"
 pprConvention  PrimOpCall           = text "<primop-call-convention>"
 pprConvention  PrimOpReturn         = text "<primop-ret-convention>"
-pprConvention (Foreign c)           = ppr c
-pprConvention (Private {})          = text "<private-convention>"
 
 pprForeignConvention :: ForeignConvention -> SDoc
 pprForeignConvention (ForeignConvention c as rs) = ppr c <> ppr as <> ppr rs
@@ -229,9 +227,9 @@ pprNode node = pp_node <+> pp_debug
                                      , ptext (sLit ": goto")
                                      , ppr (head [ id | Just id <- ids]) <> semi ]
 
-      CmmCall tgt k out res updfr_off ->
+      CmmCall tgt k regs out res updfr_off ->
           hcat [ ptext (sLit "call"), space
-               , pprFun tgt, ptext (sLit "(...)"), space
+               , pprFun tgt, parens (interpp'SP regs), space
                , ptext (sLit "returns to") <+> ppr k <+> parens (ppr out)
                                                      <+> parens (ppr res)
                , ptext (sLit " with update frame") <+> ppr updfr_off
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index 7503127555..119f2b7239 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -237,12 +237,8 @@ pprLocalReg (LocalReg uniq rep)
 
 -- Stack areas
 pprArea :: Area -> SDoc
-pprArea (RegSlot r)   = hcat [ text "slot<", ppr r, text ">" ]
-pprArea (CallArea id) = pprAreaId id
-
-pprAreaId :: AreaId -> SDoc
-pprAreaId Old        = text "old"
-pprAreaId (Young id) = hcat [ text "young<", ppr id, text ">" ]
+pprArea Old        = text "old"
+pprArea (Young id) = hcat [ text "young<", ppr id, text ">" ]
 
 -- needs to be kept in syn with CmmExpr.hs.GlobalReg
 --
diff --git a/compiler/cmm/SMRep.lhs b/compiler/cmm/SMRep.lhs
index ce30799bf6..8b3308ef97 100644
--- a/compiler/cmm/SMRep.lhs
+++ b/compiler/cmm/SMRep.lhs
@@ -21,6 +21,7 @@ module SMRep (
 	StgWord, StgHalfWord, 
 	hALF_WORD_SIZE, hALF_WORD_SIZE_IN_BITS,
 	WordOff, ByteOff,
+        roundUpToWords,
 
         -- * Closure repesentation
         SMRep(..),	-- CmmInfo sees the rep; no one else does
@@ -57,6 +58,7 @@ import FastString
 
 import Data.Char( ord )
 import Data.Word
+import Data.Bits
 \end{code}
 
 
@@ -69,6 +71,9 @@ import Data.Word
 \begin{code}
 type WordOff = Int	-- Word offset, or word count
 type ByteOff = Int	-- Byte offset, or byte count
+
+roundUpToWords :: ByteOff -> ByteOff
+roundUpToWords n = (n + (wORD_SIZE - 1)) .&. (complement (wORD_SIZE - 1))
 \end{code}
 
 StgWord is a type representing an StgWord on the target platform.
@@ -93,6 +98,7 @@ hALF_WORD_SIZE_IN_BITS = 32
 #endif
 \end{code}
 
+
 %************************************************************************
 %*									*
 \subsubsection[SMRep-datatype]{@SMRep@---storage manager representation}
diff --git a/compiler/cmm/cmm-notes b/compiler/cmm/cmm-notes
index 81882c8c0e..7f7107a18d 100644
--- a/compiler/cmm/cmm-notes
+++ b/compiler/cmm/cmm-notes
@@ -24,32 +24,6 @@ More notes (June 11)
 * Check in ClosureInfo:
      -- NB: Results here should line up with the results of SMRep.rtsClosureType
 
-* Possible refactoring: Nuke AGraph in favour of 
-      mkIfThenElse :: Expr -> Graph -> Graph -> FCode Graph
-  or even
-      mkIfThenElse :: HasUniques m => Expr -> Graph -> Graph -> m Graph
-  (Remmber that the .cmm file parser must use this function)
-
-  or parameterise FCode over its envt; the CgState part seem useful for both
-
-* "Remove redundant reloads" in CmmSpillReload should be redundant; since
-  insertLateReloads is now gone, every reload is reloading a live variable.
-  Test and nuke.
-
-* Stack layout is very like register assignment: find non-conflicting assigments.
-  In particular we can use colouring or linear scan (etc).
-
-  We'd fine-grain interference (on a word by word basis) to get maximum overlap.
-  But that may make very big interference graphs.  So linear scan might be
-  more attactive.
-
-  NB: linear scan does on-the-fly live range splitting.
-
-* When stubbing dead slots be careful not to write into an area that
-  overlaps with an area that's in use.  So stubbing needs to *follow* 
-  stack layout.
-
-
 More notes (May 11)
 ~~~~~~~~~~~~~~~~~~~
 In CmmNode, consider spliting CmmCall into two: call and jump
@@ -58,81 +32,16 @@ Notes on new codegen (Aug 10)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Things to do:
- - We insert spills for variables before the stack check! This is the reason for
-   some fishy code in StgCmmHeap.entryHeapCheck where we are doing some strange
-	things to fix up the stack pointer before GC calls/jumps.
-
-	The reason spills are inserted before the sp check is that at the entry to a
-	function we always store the parameters passed in registers to local variables.
-	The spill pass simply inserts spills at variable definitions. We instead should
-	sink the spills so that we can avoid spilling them on branches that never
-	reload them.
-
-	This will fix the spill before stack check problem but only really as a side
-	effect. A 'real fix' probably requires making the spiller know about sp checks.
-
-   EZY: I don't understand this comment. David Terei, can you clarify?
-
  - Proc points pass all arguments on the stack, adding more code and
    slowing down things a lot. We either need to fix this or even better
    would be to get rid of proc points.
 
- - CmmInfo.cmmToRawCmm uses Old.Cmm, so it is called after converting Cmm.Cmm to
-   Old.Cmm. We should abstract it to work on both representations, it needs only to
-   convert a CmmInfoTable to [CmmStatic].
-
- - The MkGraph currenty uses a different semantics for <*> than Hoopl. Maybe
-   we could convert codeGen/StgCmm* clients to the Hoopl's semantics?
-   It's all deeply unsatisfactory.
-
- - Improve performance of Hoopl.
-
-   A nofib comparison of -fasm vs -fnewcodegen nofib compilation parameters
-   (using the same ghc-cmm branch +libraries compiled by the old codegenerator)
-   is at http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.oldghchoopl.txt
-   - the code produced is 10.9% slower, the compilation is +118% slower!
-
-   The same comparison with ghc-head with zip representation is at
-   http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.oldghczip.txt
-   - the code produced is 11.7% slower, the compilation is +78% slower.
-
-   When compiling nofib, ghc-cmm + libraries compiled with -fnew-codegen
-   is 23.7% slower (http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.hooplghcoldgen.txt).
-   When compiling nofib, ghc-head + libraries compiled with -fnew-codegen
-   is 31.4% slower (http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.zipghcoldgen.txt).
-
-   So we generate a bit better code, but it takes us longer!
-
-   EZY: Also importantly, Hoopl uses dramatically more memory than the
-   old code generator.
-
- - Are all blockToNodeList and blockOfNodeList really needed? Maybe we could
-   splice blocks instead?
-
-   In the CmmContFlowOpt.blockConcat, using Dataflow seems too clumsy. Still,
-   a block catenation function would be probably nicer than blockToNodeList
-   / blockOfNodeList combo.
-
- - lowerSafeForeignCall seems too lowlevel. Just use Dataflow. After that
-   delete splitEntrySeq from HooplUtils.
-
- - manifestSP seems to touch a lot of the graph representation. It is
-   also slow for CmmSwitch nodes O(block_nodes * switch_statements).
-   Maybe rewrite manifestSP to use Dataflow?
-
  - Sort out Label, LabelMap, LabelSet versus BlockId, BlockEnv, BlockSet
    dichotomy. Mostly this means global replace, but we also need to make
    Label an instance of Outputable (probably in the Outputable module).
 
    EZY: We should use Label, since that's the terminology Hoopl uses.
 
- - NB that CmmProcPoint line 283 has a hack that works around a GADT-related
-   bug in 6.10.
-
- - SDM (2010-02-26) can we remove the Foreign constructor from Convention?
-   Reason: we never generate code for a function with the Foreign
-   calling convention, and the code for calling foreign calls is generated
-
  - AsmCodeGen has a generic Cmm optimiser; move this into new pipeline
    EZY (2011-04-16): The mini-inliner has been generalized and ported,
    but the constant folding and other optimizations need to still be
@@ -146,100 +55,25 @@ Things to do:
 	new native codegen, much in the way that we lower calling conventions.
 	Might need to be a bit sophisticated about aliasing.
 
- - Question: currently we lift procpoints to become separate
-   CmmProcs.  Do we still want to do this?
-    
-   NB: and advantage of continuing to do this is that
-   we can do common-proc elimination!
-
  - Move to new Cmm rep:
      * Make native CG consume New Cmm; 
      * Convert Old Cmm->New Cmm to keep old path alive
      * Produce New Cmm when reading in .cmm files
 
- - Consider module names
-
  - Top-level SRT threading is a bit ugly
 
- - Add type/newtype for CmmModule = [CmmGroup]    -- A module
-                        CmmGroup  = [CmmTop]      -- A .o file
-                        CmmTop    = Proc | Data   -- A procedure or data
-
- - This is a *change*: currently a CmmGroup is one function's-worth of code
-   regardless of SplitObjs.   Question: can we *always* generate M.o if there
-   is just one element in the list (rather than M/M1.o, M/M2.o etc)
-
-   One SRT per group.
-
  - See "CAFs" below; we want to totally refactor the way SRTs are calculated
 
- - Pull out Areas into its own module
-   Parameterise AreaMap (note there are type synonyms in CmmStackLayout!)
-   Add ByteWidth = Int
-   type SubArea    = (Area, ByteOff, ByteWidth) 
-   ByteOff should not be defined in SMRep -- that is too high up the hierarchy
-   
- - SMRep should not be imported by any module in cmm/!  Make it so.
-	-- ByteOff etc   ==>  CmmExpr
-        -- rET_SMALL etc ==> CmmInfo
-   Check that there are no other imports from codeGen in cmm/
-
- - If you eliminate a label by branch chain elimination,
-   what happens if there's an Area associated with that label?
-
- - Think about a non-flattened representation?
-
- - LastCall: 
-    * Use record fields for LastCall!
-    * cml_ret_off should be a ByteOff
-    * Split into 
-         LastCall (which has a successor) and
-         LastJump (which does not, includes return?)
-	   - does not have cml_cont, cml_ret_args, cml_ret_off
-	 LastForeignCall 
-           - safe! 
-           - expands into save/MidForeignCall/restore/goto
-	   - like any LastCall, target of the call gets an info table
-
- - JD: remind self of what goes wrong if you turn off the 
-   liveness of the update frame
-
  - Garbage-collect http://hackage.haskell.org/trac/ghc/wiki/Commentary/Compiler/CPS
    moving good stuff into 
    http://hackage.haskell.org/trac/ghc/wiki/Commentary/Compiler/NewCodeGenPipeline
 
-
- - We believe that all of CmmProcPoint.addProcPointProtocols is dead.  What
-   goes wrong if we simply never call it?
-
- - Something fishy in CmmStackLayout.hs
-   * In particular, 'getAreaSize' returns an AreaMap, but we *know* the width of
-	LocalRegs, so it'd be better to return FiniteMap AreaId ByteWidth
-   * setSuccSPs looks fishy.  Rather than lookin in procPoints, it could
-	just lookup the block in areaSize which, after all, has a binding
-	for precisely successors of calls.  All other blocks (including proc
-        points that are not successors of a call, we think) can be treated
-        uniformly: zero-size Area, and use inSP.
-
-
  - Currently AsmCodeGen top level calls AsmCodeGen.cmmToCmm, which is a small
    C-- optimiser.  It has quite a lot of boilerplate folding code in AsmCodeGen
    (cmmBlockConFold, cmmStmtConFold, cmmExprConFold), before calling out to
    CmmOpt.  ToDo: see what optimisations are being done; and do them before
    AsmCodeGen.
 
- - Modularise the CPS pipeline; instead of ...; A;B;C; ...
-                                use  ..; ABC; ....
-
- - Most of HscMain.tryNewCodeGen does not belong in HscMain.  Instead
-	if new_cg then
-             StgCmm.codeGen
-             processCmm  [including generating "raw" cmm]
-        else
-             CodeGen.codeGen
-             cmmToRawCmm
-
-
  - If we stick CAF and stack liveness info on a LastCall node (not LastRet/Jump)
    then all CAF and stack liveness stuff be completed before we split
    into separate C procedures.
@@ -312,9 +146,6 @@ ClosureInfo.lhs
 	Modules in cmm/
 ----------------------------------------------------
 
--------- Testing stuff ------------
-DynFlags: -frun-cpsz
-
 -------- Moribund stuff ------------
 OldCmm.hs      Definition of flowgraph of old representation
                Imports some data types from (new) Cmm
@@ -357,93 +188,6 @@ PprC.hs	            Pretty print Cmm in C syntax
 CLabel.hs           CLabel
 BlockId.hs          BlockId, BlockEnv, BlockSet
 
-----------------------------------------------------
-      Top-level structure
-----------------------------------------------------
-
-* New codgen called in HscMain.hscGenHardCode, by calling HscMain.tryNewCodeGen, 
-  enabled by -fnew-codegen (Opt_TryNewCodeGen)
-
-  THEN it calls CmmInfo.cmmToRawCmm to lay out the details of info tables
-      type Cmm    = GenCmm CmmStatic CmmInfo     (ListGraph CmmStmt)
-      type RawCmm = GenCmm CmmStatic [CmmStatic] (ListGraph CmmStmt)
-
-* HscMain.tryNewCodeGen
-    - STG->Cmm:         StgCmm.codeGen (new codegen)
-    - Optimize and CPS: CmmPipeline.cmmPipeline
-    - Convert:          CmmCvt.cmmOfZgraph (convert to old rep) very self contained
-
-* StgCmm.hs  The new STG -> Cmm conversion code generator
-  Lots of modules StgCmmXXX
-
-
-----------------------------------------------------
-      CmmPipeline.cmmPipeline   The new pipeline
-----------------------------------------------------
-
-CmmPipeline.cmmPipeline:
-   1. Do control flow optimization
-   2. Do cpsTop for each procedures separately
-   3. Build SRT representation; this spans multiple procedures
-        (unless split-objs)
-   4. Do control flow optimization on all resulting procedures
-
-cpsTop:
-  * CmmCommonBlockElim.elimCommonBlocks:
-	eliminate common blocks 
-
-  * CmmProcPoint.minimalProcPointSet
-	identify proc-points
-        no change to graph
-
-  * CmmProcPoint.addProcPointProtocols
-	something to do with the MA optimisation
-        probably entirely unnecessary
-
-  * Spill and reload:
-     - CmmSpillReload.dualLivenessWithInsertion
-       insert spills/reloads across 
-	   LastCalls, and 
-	   Branches to proc-points
-     Now sink those reloads (and other instructions):
-     - CmmSpillReload.rewriteAssignments
-     - CmmSpillReload.removeDeadAssignmentsAndReloads
-
-  * CmmStackLayout.stubSlotsOnDeath
-	debug only: zero out dead slots when they die
-
-  * Stack layout
-     - CmmStackLayout.lifeSlotAnal: 
-       find which sub-areas are live on entry to each block
-
-     - CmmStackLayout.layout
-       Lay out the stack, returning an AreaMap
-         type AreaMap = FiniteMap Area ByteOff
-          -- Byte offset of the oldest byte of the Area, 
-          -- relative to the oldest byte of the Old Area
-
-     - CmmStackLayout.manifestSP
-       Manifest the stack pointer
-
-   * Split into separate procedures
-
-      - CmmProcPoint.procPointAnalysis
-        Given set of proc points (computed earlier by
-           CmmProcPoint.minimalProcPointSet) find which blocks 
-           are reachable from each
-	Each block should be reachable from *one* proc point, so 
-	  the blocks reachable from P are the internal nodes of 
-	  the final procedure P
-        NB: if we the earlier analysis had produced too few proc-points 
-            we should nevertheless be fine by code duplication; but
-	    that is not implemented
-
-      - CmmProcPoint.splitAtProcPoints
-	Using this info, split into separate procedures
-
-      - CmmBuildInfoTables.setInfoTableStackMap
-	Attach stack maps to each info table
-
 
 ----------------------------------------------------
 	Proc-points
@@ -539,116 +283,3 @@ a dominator analysis, using the Dataflow Engine.
 * DECIDED: we can generate SRTs based on the final Cmm program
   without knowledge of how it is generated.
 
-----------------------------------------------------
-		Foreign calls
-----------------------------------------------------
-
-See Note [Foreign calls] in CmmNode!  This explains that a safe
-foreign call must do this:
-  save thread state
-  push info table (on thread stack) to describe frame
-  make call (via C stack)
-  pop info table
-  restore thread state
-and explains why this expansion must be done late in the day.
-
-Hence, 
-  - Every foreign call is represented as a middle node
-
-  - *Unsafe* foreign calls are simply "fat machine instructions"
-      and are passed along to the native code generator
-
-  - *Safe* foreign calls are "lowered" to unsafe calls by wrapping
-      them in the above save/restore sequence. This step is done
-      very late in the pipeline, just before handing to the native
-      code gen.   
-  
-      This lowering is done by BuildInfoTables.lowerSafeForeignCalls
-
-
-NEW PLAN for foreign calls:
-  - Unsafe foreign calls remain as a middle node (fat machine instruction)
-    Even the parameter passing is not lowered (just as machine instrs
-    get arguments).
-
-  - Initially, safe foreign calls appear as LastCalls with 
-	
-
-----------------------------------------------------
-		Cmm representations
-----------------------------------------------------
-
-* CmmDecl.hs
-     The type [GenCmm d h g] represents a whole module, 
-	** one list element per .o file **
-	Without SplitObjs, the list has exactly one element
-
-     newtype GenCmm d h g = Cmm [GenCmmTop d h g]  -- A whole .o file
-     data GenCmmTop d h g
-         = CmmProc h g           -- One procedure, graph d
-         | CmmData <stuff> [d]   -- Initialised data, items d
-
-  Old and new piplines use different representations
-  	(CmmCvt.hs converts between the two)
-
-
--------------
-OLD BACK END representations (OldCmm.hs):  
-      type Cmm = GenCmm CmmStatic CmmInfo (ListGraph CmmStmt)
-				-- A whole module
-      newtype ListGraph i = ListGraph [GenBasicBlock i]
-
-      data CmmStmt = Assign | Store | Return etc -- OLD BACK END ONLY
-
-
-   Once the info tables are laid out, we replace CmmInfo with [CmmStatic]
-      type RawCmm    = GenCmm CmmStatic [CmmStatic] (ListGraph CmmStmt)
-   which represents the info tables as data, that should 
-   immediately precede the code
-  
--------------
-NEW BACK END representations 
-* Uses Hoopl library, a zero-boot package
-* CmmNode defines a node of a flow graph.
-* Cmm defines CmmGraph, CmmTop, Cmm
-   - CmmGraph is a closed/closed graph + an entry node.
-
-       data CmmGraph = CmmGraph { g_entry :: BlockId
-                                , g_graph :: Graph CmmNode C C }
-
-   - CmmTop is a top level chunk, specialization of GenCmmTop from CmmDecl.hs
-       with CmmGraph as a flow graph.
-   - Cmm is a collection of CmmTops.
-
-       type Cmm          = GenCmm    CmmStatic CmmTopInfo CmmGraph
-       type CmmTop       = GenCmmTop CmmStatic CmmTopInfo CmmGraph
-
-   - CmmTop uses CmmTopInfo, which is a CmmInfoTable and CmmStackInfo
-
-       data CmmTopInfo   = TopInfo {info_tbl :: CmmInfoTable, stack_info :: CmmStackInfo}
-
-   - CmmStackInfo
-
-       data CmmStackInfo = StackInfo {arg_space :: ByteOff, updfr_space :: Maybe ByteOff}
-
-         * arg_space = SP offset on entry
-         * updfr_space space = SP offset on exit
-       Once the staci is manifested, we could drom CmmStackInfo, ie. get
-         GenCmm CmmStatic CmmInfoTable CmmGraph, but we do not do that currently.
-
-
-* MkGraph.hs: smart constructors for Cmm.hs
-  Beware, the CmmAGraph defined here does not use AGraph from Hoopl,
-  as CmmAGraph can be opened or closed at exit, See the notes in that module.
-
--------------
-* SHARED stuff
-  CmmDecl.hs - GenCmm and GenCmmTop types
-  CmmExpr.hs - defines the Cmm expression types
-             - CmmExpr, CmmReg, CmmLit, LocalReg, GlobalReg
-             - Area, AreaId etc     (separate module?)
-  CmmType.hs - CmmType, Width etc   (saparate module?)
-  CmmMachOp.hs - MachOp and CallishMachOp types
-
-  BlockId.hs defines  BlockId, BlockEnv, BlockSet
--------------
author	Simon Peyton Jones <simonpj@microsoft.com>	2012-07-10 16:18:54 +0100
committer	Simon Peyton Jones <simonpj@microsoft.com>	2012-07-10 16:18:54 +0100
commit	b8bfab8076f1d8fd5ed1d634fee82fd4d2fc0ed8 (patch)
tree	e736c4fd8a77773e081e3987d4c17bb2a9e7c98e /compiler/cmm
parent	c548f91feddf149ee4d3358483828f2d4c0ec41b (diff)
parent	713cf473de8a2ad7d0b8195d78860c25fec41839 (diff)
download	haskell-b8bfab8076f1d8fd5ed1d634fee82fd4d2fc0ed8.tar.gz