summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon Peyton Jones <simonpj@microsoft.com>2012-07-10 16:04:30 +0100
committerSimon Peyton Jones <simonpj@microsoft.com>2012-07-10 16:04:30 +0100
commit8bbdab1852beda96213ab18d228eb7a1002cedb6 (patch)
tree3acf0197801b2b9c259149e90ef5e1a7f5f47154
parent3fe3ef509627cb8c9aa7751beb44d1b4408f8b22 (diff)
parent713cf473de8a2ad7d0b8195d78860c25fec41839 (diff)
downloadhaskell-8bbdab1852beda96213ab18d228eb7a1002cedb6.tar.gz
Merge branch 'master' of http://darcs.haskell.org/ghc
Conflicts: compiler/typecheck/TcRnDriver.lhs
-rw-r--r--compiler/basicTypes/Id.lhs9
-rw-r--r--compiler/basicTypes/IdInfo.lhs14
-rw-r--r--compiler/basicTypes/MkId.lhs20
-rw-r--r--compiler/basicTypes/MkId.lhs-boot3
-rw-r--r--compiler/basicTypes/SrcLoc.lhs6
-rw-r--r--compiler/basicTypes/UniqSupply.lhs35
-rw-r--r--compiler/cmm/BlockId.hs10
-rw-r--r--compiler/cmm/Cmm.hs18
-rw-r--r--compiler/cmm/CmmBuildInfoTables.hs278
-rw-r--r--compiler/cmm/CmmCallConv.hs27
-rw-r--r--compiler/cmm/CmmCommonBlockElim.hs143
-rw-r--r--compiler/cmm/CmmContFlowOpt.hs295
-rw-r--r--compiler/cmm/CmmCvt.hs24
-rw-r--r--compiler/cmm/CmmExpr.hs126
-rw-r--r--compiler/cmm/CmmInfo.hs17
-rw-r--r--compiler/cmm/CmmLayoutStack.hs1049
-rw-r--r--compiler/cmm/CmmLint.hs222
-rw-r--r--compiler/cmm/CmmLive.hs57
-rw-r--r--compiler/cmm/CmmNode.hs107
-rw-r--r--compiler/cmm/CmmOpt.hs119
-rw-r--r--compiler/cmm/CmmParse.y42
-rw-r--r--compiler/cmm/CmmPipeline.hs189
-rw-r--r--compiler/cmm/CmmProcPoint.hs345
-rw-r--r--compiler/cmm/CmmRewriteAssignments.hs33
-rw-r--r--compiler/cmm/CmmSpillReload.hs166
-rw-r--r--compiler/cmm/CmmStackLayout.hs591
-rw-r--r--compiler/cmm/CmmUtils.hs126
-rw-r--r--compiler/cmm/Hoopl.hs125
-rw-r--r--compiler/cmm/Hoopl/Dataflow.hs887
-rw-r--r--compiler/cmm/MkGraph.hs548
-rw-r--r--compiler/cmm/OldCmm.hs33
-rw-r--r--compiler/cmm/OldCmmLint.hs209
-rw-r--r--compiler/cmm/OldPprCmm.hs31
-rw-r--r--compiler/cmm/OptimizationFuel.hs142
-rw-r--r--compiler/cmm/PprCmm.hs6
-rw-r--r--compiler/cmm/PprCmmExpr.hs8
-rw-r--r--compiler/cmm/SMRep.lhs6
-rw-r--r--compiler/cmm/cmm-notes369
-rw-r--r--compiler/codeGen/CgInfoTbls.hs30
-rw-r--r--compiler/codeGen/CgMonad.lhs19
-rw-r--r--compiler/codeGen/CgStackery.lhs3
-rw-r--r--compiler/codeGen/CgUtils.hs10
-rw-r--r--compiler/codeGen/CodeGen.lhs58
-rw-r--r--compiler/codeGen/StgCmm.hs135
-rw-r--r--compiler/codeGen/StgCmmBind.hs57
-rw-r--r--compiler/codeGen/StgCmmClosure.hs29
-rw-r--r--compiler/codeGen/StgCmmCon.hs1
-rw-r--r--compiler/codeGen/StgCmmEnv.hs3
-rw-r--r--compiler/codeGen/StgCmmExpr.hs275
-rw-r--r--compiler/codeGen/StgCmmForeign.hs134
-rw-r--r--compiler/codeGen/StgCmmHeap.hs94
-rw-r--r--compiler/codeGen/StgCmmLayout.hs247
-rw-r--r--compiler/codeGen/StgCmmMonad.hs115
-rw-r--r--compiler/codeGen/StgCmmPrim.hs59
-rw-r--r--compiler/codeGen/StgCmmProf.hs8
-rw-r--r--compiler/codeGen/StgCmmTicky.hs4
-rw-r--r--compiler/codeGen/StgCmmUtils.hs251
-rw-r--r--compiler/coreSyn/CoreFVs.lhs2
-rw-r--r--compiler/coreSyn/CoreSubst.lhs5
-rw-r--r--compiler/coreSyn/CoreSyn.lhs18
-rw-r--r--compiler/coreSyn/CoreTidy.lhs4
-rw-r--r--compiler/coreSyn/CoreUnfold.lhs2
-rw-r--r--compiler/coreSyn/CoreUtils.lhs2
-rw-r--r--compiler/coreSyn/MkCore.lhs23
-rw-r--r--compiler/coreSyn/PprCore.lhs4
-rw-r--r--compiler/deSugar/Coverage.lhs1
-rw-r--r--compiler/ghc.cabal.in8
-rw-r--r--compiler/ghc.mk14
-rw-r--r--compiler/ghci/ByteCodeGen.lhs26
-rw-r--r--compiler/ghci/Linker.lhs28
-rw-r--r--compiler/hsSyn/HsBinds.lhs6
-rw-r--r--compiler/iface/BinIface.hs13
-rw-r--r--compiler/iface/IfaceSyn.lhs10
-rw-r--r--compiler/iface/MkIface.lhs25
-rw-r--r--compiler/iface/TcIface.lhs8
-rw-r--r--compiler/llvmGen/LlvmCodeGen.hs21
-rw-r--r--compiler/llvmGen/LlvmCodeGen/Base.hs11
-rw-r--r--compiler/llvmGen/LlvmCodeGen/CodeGen.hs18
-rw-r--r--compiler/main/CodeOutput.lhs57
-rw-r--r--compiler/main/DynFlags.hs91
-rw-r--r--compiler/main/ErrUtils.lhs11
-rw-r--r--compiler/main/HscMain.hs75
-rw-r--r--compiler/main/HscTypes.lhs20
-rw-r--r--compiler/main/Packages.lhs4
-rw-r--r--compiler/main/StaticFlagParser.hs25
-rw-r--r--compiler/main/StaticFlags.hs1
-rw-r--r--compiler/main/SysTools.lhs6
-rw-r--r--compiler/main/TidyPgm.lhs3
-rw-r--r--compiler/nativeGen/AsmCodeGen.lhs56
-rw-r--r--compiler/nativeGen/RegAlloc/Linear/Main.hs29
-rw-r--r--compiler/nativeGen/X86/CodeGen.hs26
-rw-r--r--compiler/nativeGen/X86/Instr.hs14
-rw-r--r--compiler/nativeGen/X86/Ppr.hs6
-rw-r--r--compiler/nativeGen/X86/Regs.hs95
-rw-r--r--compiler/parser/Lexer.x2
-rw-r--r--compiler/prelude/PrelRules.lhs39
-rw-r--r--compiler/prelude/PrimOp.lhs-boot7
-rw-r--r--compiler/rename/RnEnv.lhs3
-rw-r--r--compiler/simplCore/CoreMonad.lhs10
-rw-r--r--compiler/simplCore/Simplify.lhs2
-rw-r--r--compiler/simplStg/SRT.lhs4
-rw-r--r--compiler/simplStg/SimplStg.lhs5
-rw-r--r--compiler/specialise/Rules.lhs5
-rw-r--r--compiler/stgSyn/StgLint.lhs1
-rw-r--r--compiler/typecheck/TcDeriv.lhs3
-rw-r--r--compiler/typecheck/TcErrors.lhs3
-rw-r--r--compiler/typecheck/TcInstDcls.lhs147
-rw-r--r--compiler/typecheck/TcRnDriver.lhs10
-rw-r--r--compiler/typecheck/TcRnMonad.lhs3
-rw-r--r--compiler/typecheck/TcSplice.lhs3
-rw-r--r--compiler/types/InstEnv.lhs11
-rw-r--r--compiler/utils/Digraph.lhs8
-rw-r--r--compiler/utils/ListSetOps.lhs73
-rw-r--r--compiler/utils/MonadUtils.hs18
-rw-r--r--compiler/utils/OrdList.lhs47
-rw-r--r--compiler/utils/Stream.hs97
-rw-r--r--compiler/utils/Util.lhs183
-rw-r--r--compiler/vectorise/Vectorise/Generic/PADict.hs41
-rw-r--r--compiler/vectorise/Vectorise/Generic/PAMethods.hs11
-rw-r--r--compiler/vectorise/Vectorise/Monad/InstEnv.hs9
-rw-r--r--compiler/vectorise/Vectorise/Utils/Base.hs19
-rw-r--r--compiler/vectorise/Vectorise/Utils/PADict.hs53
-rw-r--r--configure.ac12
-rw-r--r--docs/users_guide/flags.xml7
-rw-r--r--docs/users_guide/ghci.xml55
-rw-r--r--docs/users_guide/glasgow_exts.xml39
-rw-r--r--ghc.mk12
-rw-r--r--ghc/GhciTags.hs13
-rw-r--r--ghc/InteractiveUI.hs50
-rw-r--r--mk/config.mk.in1
-rw-r--r--rts/RtsFlags.c12
-rw-r--r--rts/Schedule.c2
-rw-r--r--rts/sm/GC.c14
-rw-r--r--rts/sm/GCThread.h1
-rw-r--r--rts/sm/Storage.c23
-rw-r--r--rts/sm/Storage.h2
-rw-r--r--rules/build-package-data.mk4
-rw-r--r--rules/build-package-way.mk3
-rw-r--r--rules/build-package.mk2
-rw-r--r--utils/ghc-pkg/Main.hs10
140 files changed, 5625 insertions, 4189 deletions
diff --git a/compiler/basicTypes/Id.lhs b/compiler/basicTypes/Id.lhs
index e6e221bfce..ec63b893e9 100644
--- a/compiler/basicTypes/Id.lhs
+++ b/compiler/basicTypes/Id.lhs
@@ -65,7 +65,7 @@ module Id (
hasNoBinding,
-- ** Evidence variables
- DictId, isDictId, isEvVar,
+ DictId, isDictId, dfunNSilent, isEvVar,
-- ** Inline pragma stuff
idInlinePragma, setInlinePragma, modifyInlinePragma,
@@ -118,7 +118,7 @@ import Demand
import Name
import Module
import Class
-import PrimOp
+import {-# SOURCE #-} PrimOp (PrimOp)
import ForeignCall
import Maybes
import SrcLoc
@@ -342,6 +342,11 @@ isDFunId id = case Var.idDetails id of
DFunId {} -> True
_ -> False
+dfunNSilent :: Id -> Int
+dfunNSilent id = case Var.idDetails id of
+ DFunId ns _ -> ns
+ _ -> pprPanic "dfunSilent: not a dfun:" (ppr id)
+
isPrimOpId_maybe id = case Var.idDetails id of
PrimOpId op -> Just op
_ -> Nothing
diff --git a/compiler/basicTypes/IdInfo.lhs b/compiler/basicTypes/IdInfo.lhs
index 3f5eaa4b5a..93762abba9 100644
--- a/compiler/basicTypes/IdInfo.lhs
+++ b/compiler/basicTypes/IdInfo.lhs
@@ -75,7 +75,7 @@ module IdInfo (
import CoreSyn
import Class
-import PrimOp
+import {-# SOURCE #-} PrimOp (PrimOp)
import Name
import VarSet
import BasicTypes
@@ -136,7 +136,14 @@ data IdDetails
| TickBoxOpId TickBoxOp -- ^ The 'Id' is for a HPC tick box (both traditional and binary)
- | DFunId Bool -- ^ A dictionary function.
+ | DFunId Int Bool -- ^ A dictionary function.
+ -- Int = the number of "silent" arguments to the dfun
+ -- e.g. class D a => C a where ...
+ -- instance C a => C [a]
+ -- has is_silent = 1, because the dfun
+ -- has type dfun :: (D a, C a) => C [a]
+ -- See the DFun Superclass Invariant in TcInstDcls
+ --
-- Bool = True <=> the class has only one method, so may be
-- implemented with a newtype, so it might be bad
-- to be strict on this dictionary
@@ -158,7 +165,8 @@ pprIdDetails other = brackets (pp other)
pp (PrimOpId _) = ptext (sLit "PrimOp")
pp (FCallId _) = ptext (sLit "ForeignCall")
pp (TickBoxOpId _) = ptext (sLit "TickBoxOp")
- pp (DFunId nt) = ptext (sLit "DFunId")
+ pp (DFunId ns nt) = ptext (sLit "DFunId")
+ <> ppWhen (ns /= 0) (brackets (int ns))
<> ppWhen nt (ptext (sLit "(nt)"))
pp (RecSelId { sel_naughty = is_naughty })
= brackets $ ptext (sLit "RecSel")
diff --git a/compiler/basicTypes/MkId.lhs b/compiler/basicTypes/MkId.lhs
index 3eaa7dceb5..c1127da18f 100644
--- a/compiler/basicTypes/MkId.lhs
+++ b/compiler/basicTypes/MkId.lhs
@@ -826,17 +826,29 @@ mkDictFunId :: Name -- Name to use for the dict fun;
-- Implements the DFun Superclass Invariant (see TcInstDcls)
mkDictFunId dfun_name tvs theta clas tys
- = mkExportedLocalVar (DFunId is_nt)
+ = mkExportedLocalVar (DFunId n_silent is_nt)
dfun_name
dfun_ty
vanillaIdInfo
where
is_nt = isNewTyCon (classTyCon clas)
- dfun_ty = mkDictFunTy tvs theta clas tys
+ (n_silent, dfun_ty) = mkDictFunTy tvs theta clas tys
-mkDictFunTy :: [TyVar] -> ThetaType -> Class -> [Type] -> Type
+mkDictFunTy :: [TyVar] -> ThetaType -> Class -> [Type] -> (Int, Type)
mkDictFunTy tvs theta clas tys
- = mkSigmaTy tvs theta (mkClassPred clas tys)
+ = (length silent_theta, dfun_ty)
+ where
+ dfun_ty = mkSigmaTy tvs (silent_theta ++ theta) (mkClassPred clas tys)
+ silent_theta
+ | null tvs, null theta
+ = []
+ | otherwise
+ = filterOut discard $
+ substTheta (zipTopTvSubst (classTyVars clas) tys)
+ (classSCTheta clas)
+ -- See Note [Silent Superclass Arguments]
+ discard pred = any (`eqPred` pred) theta
+ -- See the DFun Superclass Invariant in TcInstDcls
\end{code}
diff --git a/compiler/basicTypes/MkId.lhs-boot b/compiler/basicTypes/MkId.lhs-boot
index 4f9615a061..7891e65d7f 100644
--- a/compiler/basicTypes/MkId.lhs-boot
+++ b/compiler/basicTypes/MkId.lhs-boot
@@ -2,8 +2,11 @@
module MkId where
import Name( Name )
import DataCon( DataCon, DataConIds )
+import {-# SOURCE #-} PrimOp( PrimOp )
+import Id( Id )
mkDataConIds :: Name -> Name -> DataCon -> DataConIds
+mkPrimOpId :: PrimOp -> Id
\end{code}
diff --git a/compiler/basicTypes/SrcLoc.lhs b/compiler/basicTypes/SrcLoc.lhs
index a7399abf5c..2c008f55d8 100644
--- a/compiler/basicTypes/SrcLoc.lhs
+++ b/compiler/basicTypes/SrcLoc.lhs
@@ -82,6 +82,8 @@ import FastString
import Data.Bits
import Data.Data
+import Data.List
+import Data.Ord
import System.FilePath
\end{code}
@@ -176,9 +178,7 @@ instance Ord RealSrcLoc where
compare = cmpRealSrcLoc
sortLocated :: [Located a] -> [Located a]
-sortLocated things = sortLe le things
- where
- le (L l1 _) (L l2 _) = l1 <= l2
+sortLocated things = sortBy (comparing getLoc) things
cmpSrcLoc :: SrcLoc -> SrcLoc -> Ordering
cmpSrcLoc (UnhelpfulLoc s1) (UnhelpfulLoc s2) = s1 `compare` s2
diff --git a/compiler/basicTypes/UniqSupply.lhs b/compiler/basicTypes/UniqSupply.lhs
index bb40be7ac1..f3fb28ac21 100644
--- a/compiler/basicTypes/UniqSupply.lhs
+++ b/compiler/basicTypes/UniqSupply.lhs
@@ -109,7 +109,7 @@ takeUniqFromSupply (MkSplitUniqSupply n s1 _) = (mkUniqueGrimily (iBox n), s1)
\begin{code}
-- | A monad which just gives the ability to obtain 'Unique's
-newtype UniqSM result = USM { unUSM :: UniqSupply -> (result, UniqSupply) }
+newtype UniqSM result = USM { unUSM :: UniqSupply -> (# result, UniqSupply #) }
instance Monad UniqSM where
return = returnUs
@@ -118,21 +118,21 @@ instance Monad UniqSM where
instance Functor UniqSM where
fmap f (USM x) = USM (\us -> case x us of
- (r, us') -> (f r, us'))
+ (# r, us' #) -> (# f r, us' #))
instance Applicative UniqSM where
pure = returnUs
(USM f) <*> (USM x) = USM $ \us -> case f us of
- (ff, us') -> case x us' of
- (xx, us'') -> (ff xx, us'')
+ (# ff, us' #) -> case x us' of
+ (# xx, us'' #) -> (# ff xx, us'' #)
-- | Run the 'UniqSM' action, returning the final 'UniqSupply'
initUs :: UniqSupply -> UniqSM a -> (a, UniqSupply)
-initUs init_us m = case unUSM m init_us of { (r,us) -> (r,us) }
+initUs init_us m = case unUSM m init_us of { (# r, us #) -> (r,us) }
-- | Run the 'UniqSM' action, discarding the final 'UniqSupply'
initUs_ :: UniqSupply -> UniqSM a -> a
-initUs_ init_us m = case unUSM m init_us of { (r, _) -> r }
+initUs_ init_us m = case unUSM m init_us of { (# r, _ #) -> r }
{-# INLINE thenUs #-}
{-# INLINE lazyThenUs #-}
@@ -142,27 +142,30 @@ initUs_ init_us m = case unUSM m init_us of { (r, _) -> r }
@thenUs@ is where we split the @UniqSupply@.
\begin{code}
+liftUSM :: UniqSM a -> UniqSupply -> (a, UniqSupply)
+liftUSM (USM m) us = case m us of (# a, us' #) -> (a, us')
+
instance MonadFix UniqSM where
- mfix m = USM (\us -> let (r,us') = unUSM (m r) us in (r,us'))
+ mfix m = USM (\us -> let (r,us') = liftUSM (m r) us in (# r,us' #))
thenUs :: UniqSM a -> (a -> UniqSM b) -> UniqSM b
thenUs (USM expr) cont
= USM (\us -> case (expr us) of
- (result, us') -> unUSM (cont result) us')
+ (# result, us' #) -> unUSM (cont result) us')
lazyThenUs :: UniqSM a -> (a -> UniqSM b) -> UniqSM b
-lazyThenUs (USM expr) cont
- = USM (\us -> let (result, us') = expr us in unUSM (cont result) us')
+lazyThenUs expr cont
+ = USM (\us -> let (result, us') = liftUSM expr us in unUSM (cont result) us')
thenUs_ :: UniqSM a -> UniqSM b -> UniqSM b
thenUs_ (USM expr) (USM cont)
- = USM (\us -> case (expr us) of { (_, us') -> cont us' })
+ = USM (\us -> case (expr us) of { (# _, us' #) -> cont us' })
returnUs :: a -> UniqSM a
-returnUs result = USM (\us -> (result, us))
+returnUs result = USM (\us -> (# result, us #))
getUs :: UniqSM UniqSupply
-getUs = USM (\us -> splitUniqSupply us)
+getUs = USM (\us -> case splitUniqSupply us of (us1,us2) -> (# us1, us2 #))
-- | A monad for generating unique identifiers
class Monad m => MonadUnique m where
@@ -177,17 +180,17 @@ class Monad m => MonadUnique m where
getUniquesM = liftM uniqsFromSupply getUniqueSupplyM
instance MonadUnique UniqSM where
- getUniqueSupplyM = USM (\us -> splitUniqSupply us)
+ getUniqueSupplyM = getUs
getUniqueM = getUniqueUs
getUniquesM = getUniquesUs
getUniqueUs :: UniqSM Unique
getUniqueUs = USM (\us -> case splitUniqSupply us of
- (us1,us2) -> (uniqFromSupply us1, us2))
+ (us1,us2) -> (# uniqFromSupply us1, us2 #))
getUniquesUs :: UniqSM [Unique]
getUniquesUs = USM (\us -> case splitUniqSupply us of
- (us1,us2) -> (uniqsFromSupply us1, us2))
+ (us1,us2) -> (# uniqsFromSupply us1, us2 #))
\end{code}
\begin{code}
diff --git a/compiler/cmm/BlockId.hs b/compiler/cmm/BlockId.hs
index feeacb553d..95293c850b 100644
--- a/compiler/cmm/BlockId.hs
+++ b/compiler/cmm/BlockId.hs
@@ -15,7 +15,7 @@ import Outputable
import Unique
import Compiler.Hoopl as Hoopl hiding (Unique)
-import Compiler.Hoopl.GHC (uniqueToInt, uniqueToLbl, lblToUnique)
+import Compiler.Hoopl.Internals (uniqueToLbl, lblToUnique)
----------------------------------------------------------------
--- Block Ids, their environments, and their sets
@@ -32,14 +32,14 @@ compilation unit in which it appears.
type BlockId = Hoopl.Label
instance Uniquable BlockId where
- getUnique label = getUnique (uniqueToInt $ lblToUnique label)
-
-mkBlockId :: Unique -> BlockId
-mkBlockId unique = uniqueToLbl $ intToUnique $ getKey unique
+ getUnique label = getUnique (lblToUnique label)
instance Outputable BlockId where
ppr label = ppr (getUnique label)
+mkBlockId :: Unique -> BlockId
+mkBlockId unique = uniqueToLbl $ intToUnique $ getKey unique
+
retPtLbl :: BlockId -> CLabel
retPtLbl label = mkReturnPtLabel $ getUnique label
diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs
index f1318c1dc9..1c77409e49 100644
--- a/compiler/cmm/Cmm.hs
+++ b/compiler/cmm/Cmm.hs
@@ -32,9 +32,9 @@ module Cmm (
import CLabel
import BlockId
import CmmNode
-import OptimizationFuel as F
import SMRep
import CmmExpr
+import UniqSupply
import Compiler.Hoopl
import Data.Word ( Word8 )
@@ -69,8 +69,6 @@ type CmmGroup = GenCmmGroup CmmStatics CmmTopInfo CmmGraph
-- (a) C--, i.e. populated with various C-- constructs
-- (Cmm and RawCmm in OldCmm.hs)
-- (b) Native code, populated with data/instructions
---
--- A second family of instances based on Hoopl is in Cmm.hs.
-- | A top-level chunk, abstracted over the type of the contents of
-- the basic blocks (Cmm or instructions are the likely instantiations).
@@ -95,19 +93,23 @@ data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C }
type CmmBlock = Block CmmNode C C
type CmmReplGraph e x = GenCmmReplGraph CmmNode e x
-type GenCmmReplGraph n e x = FuelUniqSM (Maybe (Graph n e x))
-type CmmFwdRewrite f = FwdRewrite FuelUniqSM CmmNode f
-type CmmBwdRewrite f = BwdRewrite FuelUniqSM CmmNode f
+type GenCmmReplGraph n e x = UniqSM (Maybe (Graph n e x))
+type CmmFwdRewrite f = FwdRewrite UniqSM CmmNode f
+type CmmBwdRewrite f = BwdRewrite UniqSM CmmNode f
-----------------------------------------------------------------------------
-- Info Tables
-----------------------------------------------------------------------------
-data CmmTopInfo = TopInfo {info_tbl :: CmmInfoTable, stack_info :: CmmStackInfo}
+data CmmTopInfo = TopInfo { info_tbl :: CmmInfoTable
+ , stack_info :: CmmStackInfo }
data CmmStackInfo
= StackInfo {
- arg_space :: ByteOff, -- XXX: comment?
+ arg_space :: ByteOff,
+ -- number of bytes of arguments on the stack on entry to the
+ -- the proc. This is filled in by StgCmm.codeGen, and used
+ -- by the stack allocator later.
updfr_space :: Maybe ByteOff -- XXX: comment?
}
diff --git a/compiler/cmm/CmmBuildInfoTables.hs b/compiler/cmm/CmmBuildInfoTables.hs
index ab829de499..ebe755219b 100644
--- a/compiler/cmm/CmmBuildInfoTables.hs
+++ b/compiler/cmm/CmmBuildInfoTables.hs
@@ -14,169 +14,53 @@
{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
module CmmBuildInfoTables
( CAFSet, CAFEnv, cafAnal, localCAFInfo, mkTopCAFInfo
- , setInfoTableSRT, setInfoTableStackMap
+ , setInfoTableSRT
, TopSRT, emptySRT, srtToData
, bundleCAFs
- , lowerSafeForeignCalls
- , cafTransfers, liveSlotTransfers
- , mkLiveness )
+ , cafTransfers )
where
#include "HsVersions.h"
-- These should not be imported here!
-import StgCmmForeign
import StgCmmUtils
-import Constants
import Digraph
import qualified Prelude as P
import Prelude hiding (succ)
-import Util
import BlockId
import Bitmap
import CLabel
import Cmm
import CmmUtils
-import CmmStackLayout
-import Module
-import FastString
-import ForeignCall
import IdInfo
import Data.List
import Maybes
-import MkGraph as M
-import Control.Monad
import Name
-import OptimizationFuel
import Outputable
import SMRep
import UniqSupply
-import Compiler.Hoopl
+import Hoopl
import Data.Map (Map)
import qualified Data.Map as Map
-import qualified FiniteMap as Map
+import Data.Set (Set)
+import qualified Data.Set as Set
+
+foldSet :: (a -> b -> b) -> b -> Set a -> b
+#if __GLASGOW_HASKELL__ < 704
+foldSet = Set.fold
+#else
+foldSet = Set.foldr
+#endif
----------------------------------------------------------------
-- Building InfoTables
-----------------------------------------------------------------------
--- Stack Maps
-
--- Given a block ID, we return a representation of the layout of the stack,
--- as suspended before entering that block.
--- (For a return site to a function call, the layout does not include the
--- parameter passing area (or the "return address" on the stack)).
--- If the element is `Nothing`, then it represents a word of the stack that
--- does not contain a live pointer.
--- If the element is `Just` a register, then it represents a live spill slot
--- for a pointer; we assume that a pointer is the size of a word.
--- The head of the list represents the young end of the stack where the infotable
--- pointer for the block `Bid` is stored.
--- The infotable pointer itself is not included in the list.
--- Call areas are also excluded from the list: besides the stuff in the update
--- frame (and the return infotable), call areas should never be live across
--- function calls.
-
--- RTS Invariant: All pointers must be word-aligned because each bit in the bitmap
--- represents a word. Consequently, we have to be careful when we see a live slot
--- on the stack: if we have packed multiple sub-word values into a word,
--- we have to make sure that we only mark the entire word as a non-pointer.
-
--- Also, don't forget to stop at the old end of the stack (oldByte),
--- which may differ depending on whether there is an update frame.
-
-type RegSlotInfo
- = ( Int -- Offset from oldest byte of Old area
- , LocalReg -- The register
- , Int) -- Width of the register
-
-live_ptrs :: ByteOff -> BlockEnv SubAreaSet -> AreaMap -> BlockId -> StackLayout
-live_ptrs oldByte slotEnv areaMap bid =
- -- pprTrace "live_ptrs for" (ppr bid <+> text (show oldByte ++ "-" ++ show youngByte) <+>
- -- ppr liveSlots) $
- -- pprTrace ("stack layout for " ++ show bid ++ ": ") (ppr res) $ res
- res
- where
- res = mkLiveness (reverse $ slotsToList youngByte liveSlots [])
-
- slotsToList :: Int -> [RegSlotInfo] -> [Maybe LocalReg] -> [Maybe LocalReg]
- -- n starts at youngByte and is decremented down to oldByte
- -- Returns a list, one element per word, with
- -- (Just r) meaning 'pointer register r is saved here',
- -- Nothing meaning 'non-pointer or empty'
-
- slotsToList n [] results | n == oldByte = results -- at old end of stack frame
-
- slotsToList n (s : _) _ | n == oldByte =
- pprPanic "slot left off live_ptrs" (ppr s <+> ppr oldByte <+>
- ppr n <+> ppr liveSlots <+> ppr youngByte)
-
- slotsToList n _ _ | n < oldByte =
- panic "stack slots not allocated on word boundaries?"
-
- slotsToList n l@((n', r, w) : rst) results =
- if n == (n' + w) then -- slot's young byte is at n
- ASSERT (not (isPtr r) ||
- (n `mod` wORD_SIZE == 0 && w == wORD_SIZE)) -- ptrs must be aligned
- slotsToList next (dropWhile (non_ptr_younger_than next) rst)
- (stack_rep : results)
- else slotsToList next (dropWhile (non_ptr_younger_than next) l)
- (Nothing : results)
- where next = n - wORD_SIZE
- stack_rep = if isPtr r then Just r else Nothing
-
- slotsToList n [] results = slotsToList (n - wORD_SIZE) [] (Nothing : results)
-
- non_ptr_younger_than next (n', r, w) =
- n' + w > next &&
- ASSERT (not (isPtr r))
- True
- isPtr = isGcPtrType . localRegType
-
- liveSlots :: [RegSlotInfo]
- liveSlots = sortBy (\ (off,_,_) (off',_,_) -> compare off' off)
- (Map.foldRightWithKey (\_ -> flip $ foldl add_slot) [] slots)
-
- add_slot :: [RegSlotInfo] -> SubArea -> [RegSlotInfo]
- add_slot rst (a@(RegSlot r@(LocalReg _ ty)), off, w) =
- if off == w && widthInBytes (typeWidth ty) == w then
- (expectJust "add_slot" (Map.lookup a areaMap), r, w) : rst
- else panic "live_ptrs: only part of a variable live at a proc point"
- add_slot rst (CallArea Old, _, _) =
- rst -- the update frame (or return infotable) should be live
- -- would be nice to check that only that part of the callarea is live...
- add_slot rst ((CallArea _), _, _) =
- rst
- -- JD: THIS ISN'T CURRENTLY A CORRECTNESS PROBLEM, BUT WE SHOULD REALLY
- -- MAKE LIVENESS INFO AROUND CALLS MORE PRECISE -- FOR NOW, A 32-BIT
- -- FLOAT PADS OUT TO 64 BITS, BUT WE ASSUME THE WHOLE PARAMETER-PASSING
- -- AREA IS LIVE (WHICH IT ISN'T...). WE SHOULD JUST PUT THE LIVE AREAS
- -- IN THE CALL NODES, WHICH SHOULD EVENTUALLY HAVE LIVE REGISTER AS WELL,
- -- SO IT'S ALL GOING IN THE SAME DIRECTION.
- -- pprPanic "CallAreas must not be live across function calls" (ppr bid <+> ppr c)
-
- slots :: SubAreaSet -- The SubAreaSet for 'bid'
- slots = expectJust "live_ptrs slots" $ mapLookup bid slotEnv
- youngByte = expectJust "live_ptrs bid_pos" $ Map.lookup (CallArea (Young bid)) areaMap
-
--- Construct the stack maps for a procedure _if_ it needs an infotable.
--- When wouldn't a procedure need an infotable? If it is a procpoint that
--- is not the successor of a call.
-setInfoTableStackMap :: SlotEnv -> AreaMap -> CmmDecl -> CmmDecl
-setInfoTableStackMap slotEnv areaMap
- t@(CmmProc (TopInfo {stack_info=StackInfo {updfr_space = Just updfr_off}}) _
- (CmmGraph {g_entry = eid}))
- = updInfo (const (live_ptrs updfr_off slotEnv areaMap eid)) id t
-setInfoTableStackMap _ _ t = t
-
-
-
------------------------------------------------------------------------
-- SRTs
-- WE NEED AN EXAMPLE HERE.
@@ -191,14 +75,14 @@ setInfoTableStackMap _ _ t = t
-----------------------------------------------------------------------
-- Finding the CAFs used by a procedure
-type CAFSet = Map CLabel ()
+type CAFSet = Set CLabel
type CAFEnv = BlockEnv CAFSet
-- First, an analysis to find live CAFs.
cafLattice :: DataflowLattice CAFSet
-cafLattice = DataflowLattice "live cafs" Map.empty add
- where add _ (OldFact old) (NewFact new) = case old `Map.union` new of
- new' -> (changeIf $ Map.size new' > Map.size old, new')
+cafLattice = DataflowLattice "live cafs" Set.empty add
+ where add _ (OldFact old) (NewFact new) = case old `Set.union` new of
+ new' -> (changeIf $ Set.size new' > Set.size old, new')
cafTransfers :: BwdTransfer CmmNode CAFSet
cafTransfers = mkBTransfer3 first middle last
@@ -210,11 +94,11 @@ cafTransfers = mkBTransfer3 first middle last
CmmLit (CmmLabelOff c _) -> add c set
CmmLit (CmmLabelDiffOff c1 c2 _) -> add c1 $ add c2 set
_ -> set
- add l s = if hasCAF l then Map.insert (toClosureLbl l) () s
+ add l s = if hasCAF l then Set.insert (toClosureLbl l) s
else s
-cafAnal :: CmmGraph -> FuelUniqSM CAFEnv
-cafAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd cafLattice cafTransfers
+cafAnal :: CmmGraph -> CAFEnv
+cafAnal g = dataflowAnalBwd g [] $ analBwd cafLattice cafTransfers
-----------------------------------------------------------------------
-- Building the SRTs
@@ -264,15 +148,15 @@ srtToData srt = [CmmData RelocatableReadOnlyData (Statics (lbl srt) tbl)]
-- we make sure they're all close enough to the bottom of the table that the
-- bitmap will be able to cover all of them.
buildSRTs :: TopSRT -> Map CLabel CAFSet -> CAFSet ->
- FuelUniqSM (TopSRT, Maybe CmmDecl, C_SRT)
+ UniqSM (TopSRT, Maybe CmmDecl, C_SRT)
buildSRTs topSRT topCAFMap cafs =
- do let liftCAF lbl () z = -- get CAFs for functions without static closures
- case Map.lookup lbl topCAFMap of Just cafs -> z `Map.union` cafs
- Nothing -> Map.insert lbl () z
+ do let liftCAF lbl z = -- get CAFs for functions without static closures
+ case Map.lookup lbl topCAFMap of Just cafs -> z `Set.union` cafs
+ Nothing -> Set.insert lbl z
-- For each label referring to a function f without a static closure,
-- replace it with the CAFs that are reachable from f.
sub_srt topSRT localCafs =
- let cafs = Map.keys (Map.foldRightWithKey liftCAF Map.empty localCafs)
+ let cafs = Set.elems (foldSet liftCAF Set.empty localCafs)
mkSRT topSRT =
do localSRTs <- procpointSRT (lbl topSRT) (elt_map topSRT) cafs
return (topSRT, localSRTs)
@@ -307,7 +191,7 @@ buildSRTs topSRT topCAFMap cafs =
-- Construct an SRT bitmap.
-- Adapted from simpleStg/SRT.lhs, which expects Id's.
procpointSRT :: CLabel -> Map CLabel Int -> [CLabel] ->
- FuelUniqSM (Maybe CmmDecl, C_SRT)
+ UniqSM (Maybe CmmDecl, C_SRT)
procpointSRT _ _ [] =
return (Nothing, NoC_SRT)
procpointSRT top_srt top_table entries =
@@ -315,7 +199,7 @@ procpointSRT top_srt top_table entries =
return (top, srt)
where
ints = map (expectJust "constructSRT" . flip Map.lookup top_table) entries
- sorted_ints = sortLe (<=) ints
+ sorted_ints = sort ints
offset = head sorted_ints
bitmap_entries = map (subtract offset) sorted_ints
len = P.last bitmap_entries + 1
@@ -325,7 +209,7 @@ maxBmpSize :: Int
maxBmpSize = widthInBits wordWidth `div` 2
-- Adapted from codeGen/StgCmmUtils, which converts from SRT to C_SRT.
-to_SRT :: CLabel -> Int -> Int -> Bitmap -> FuelUniqSM (Maybe CmmDecl, C_SRT)
+to_SRT :: CLabel -> Int -> Int -> Bitmap -> UniqSM (Maybe CmmDecl, C_SRT)
to_SRT top_srt off len bmp
| len > maxBmpSize || bmp == [fromIntegral srt_escape]
= do id <- getUniqueM
@@ -373,30 +257,30 @@ mkTopCAFInfo localCAFs = foldl addToTop Map.empty g
Map.insert l (flatten env cafset) env
addToTop env (CyclicSCC nodes) =
let (lbls, cafsets) = unzip nodes
- cafset = lbls `Map.deleteList` foldl Map.union Map.empty cafsets
+ cafset = foldr Set.delete (foldl Set.union Set.empty cafsets) lbls
in foldl (\env l -> Map.insert l (flatten env cafset) env) env lbls
- flatten env cafset = Map.foldRightWithKey (lookup env) Map.empty cafset
- lookup env caf () cafset' =
- case Map.lookup caf env of Just cafs -> Map.foldRightWithKey add cafset' cafs
- Nothing -> add caf () cafset'
- add caf () cafset' = Map.insert caf () cafset'
+ flatten env cafset = foldSet (lookup env) Set.empty cafset
+ lookup env caf cafset' =
+ case Map.lookup caf env of Just cafs -> foldSet add cafset' cafs
+ Nothing -> add caf cafset'
+ add caf cafset' = Set.insert caf cafset'
g = stronglyConnCompFromEdgedVertices
- (map (\n@(l, cafs) -> (n, l, Map.keys cafs)) localCAFs)
+ (map (\n@(l, cafs) -> (n, l, Set.elems cafs)) localCAFs)
-- Bundle the CAFs used at a procpoint.
bundleCAFs :: CAFEnv -> CmmDecl -> (CAFSet, CmmDecl)
bundleCAFs cafEnv t@(CmmProc _ _ (CmmGraph {g_entry=entry})) =
(expectJust "bundleCAFs" (mapLookup entry cafEnv), t)
-bundleCAFs _ t = (Map.empty, t)
+bundleCAFs _ t = (Set.empty, t)
-- Construct the SRTs for the given procedure.
setInfoTableSRT :: Map CLabel CAFSet -> TopSRT -> (CAFSet, CmmDecl) ->
- FuelUniqSM (TopSRT, [CmmDecl])
+ UniqSM (TopSRT, [CmmDecl])
setInfoTableSRT topCAFMap topSRT (cafs, t) =
setSRT cafs topCAFMap topSRT t
setSRT :: CAFSet -> Map CLabel CAFSet -> TopSRT ->
- CmmDecl -> FuelUniqSM (TopSRT, [CmmDecl])
+ CmmDecl -> UniqSM (TopSRT, [CmmDecl])
setSRT cafs topCAFMap topSRT t =
do (topSRT, cafTable, srt) <- buildSRTs topSRT topCAFMap cafs
let t' = updInfo id (const srt) t
@@ -418,91 +302,3 @@ updInfoTbl toVars toSrt info_tbl@(CmmInfoTable {})
StackRep ls -> StackRep (toVars ls)
other -> other }
updInfoTbl _ _ t@CmmNonInfoTable = t
-
-----------------------------------------------------------------
--- Safe foreign calls: We need to insert the code that suspends and resumes
--- the thread before and after a safe foreign call.
--- Why do we do this so late in the pipeline?
--- Because we need this code to appear without interrruption: you can't rely on the
--- value of the stack pointer between the call and resetting the thread state;
--- you need to have an infotable on the young end of the stack both when
--- suspending the thread and making the foreign call.
--- All of this is much easier if we insert the suspend and resume calls here.
-
--- At the same time, we prepare for the stages of the compiler that
--- build the proc points. We have to do this at the same time because
--- the safe foreign calls need special treatment with respect to infotables.
--- A safe foreign call needs an infotable even though it isn't
--- a procpoint. The following datatype captures the information
--- needed to generate the infotables along with the Cmm data and procedures.
-
--- JD: Why not do this while splitting procedures?
-lowerSafeForeignCalls :: AreaMap -> CmmDecl -> FuelUniqSM CmmDecl
-lowerSafeForeignCalls _ t@(CmmData _ _) = return t
-lowerSafeForeignCalls areaMap (CmmProc info l g@(CmmGraph {g_entry=entry})) = do
- let block b mblocks = mblocks >>= lowerSafeCallBlock entry areaMap b
- blocks <- foldGraphBlocks block (return mapEmpty) g
- return $ CmmProc info l (ofBlockMap entry blocks)
-
--- If the block ends with a safe call in the block, lower it to an unsafe
--- call (with appropriate saves and restores before and after).
-lowerSafeCallBlock :: BlockId -> AreaMap -> CmmBlock -> BlockEnv CmmBlock
- -> FuelUniqSM (BlockEnv CmmBlock)
-lowerSafeCallBlock entry areaMap b blocks =
- case blockToNodeList b of
- (JustC (CmmEntry id), m, JustC l@(CmmForeignCall {})) -> lowerSafeForeignCall entry areaMap blocks id m l
- _ -> return $ insertBlock b blocks
-
--- Late in the code generator, we want to insert the code necessary
--- to lower a safe foreign call to a sequence of unsafe calls.
-lowerSafeForeignCall :: BlockId -> AreaMap -> BlockEnv CmmBlock -> BlockId -> [CmmNode O O] -> CmmNode O C
- -> FuelUniqSM (BlockEnv CmmBlock)
-lowerSafeForeignCall entry areaMap blocks bid m
- (CmmForeignCall {tgt=tgt, res=rs, args=as, succ=succ, updfr = updfr_off, intrbl = intrbl}) =
- do let newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep)
- -- Both 'id' and 'new_base' are KindNonPtr because they're
- -- RTS-only objects and are not subject to garbage collection
- id <- newTemp bWord
- new_base <- newTemp (cmmRegType (CmmGlobal BaseReg))
- let (caller_save, caller_load) = callerSaveVolatileRegs
- load_tso <- newTemp gcWord -- TODO FIXME NOW
- load_stack <- newTemp gcWord -- TODO FIXME NOW
- let (<**>) = (M.<*>)
- let suspendThread = foreignLbl "suspendThread"
- resumeThread = foreignLbl "resumeThread"
- foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit name)))
- suspend = saveThreadState <**>
- caller_save <**>
- mkUnsafeCall (ForeignTarget suspendThread
- (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
- [id] [CmmReg (CmmGlobal BaseReg), CmmLit (CmmInt (fromIntegral (fromEnum intrbl)) wordWidth)]
- midCall = mkUnsafeCall tgt rs as
- resume = mkUnsafeCall (ForeignTarget resumeThread
- (ForeignConvention CCallConv [AddrHint] [AddrHint]))
- [new_base] [CmmReg (CmmLocal id)] <**>
- -- Assign the result to BaseReg: we
- -- might now have a different Capability!
- mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <**>
- caller_load <**>
- loadThreadState load_tso load_stack
- -- We have to save the return value on the stack because its next use
- -- may appear in a different procedure due to procpoint splitting...
- saveRetVals = foldl (<**>) emptyAGraph $ map (M.mkMiddle . spill) rs
- spill r = CmmStore (regSlot r) (CmmReg $ CmmLocal r)
- regSlot r@(LocalReg _ _) = CmmRegOff (CmmGlobal Sp) (sp_off - offset)
- where offset = w + expectJust "lowerForeign" (Map.lookup (RegSlot r) areaMap)
- sp_off = wORD_SIZE + expectJust "lowerForeign" (Map.lookup (CallArea area) areaMap)
- area = if succ == entry then Old else Young succ
- w = widthInBytes $ typeWidth $ localRegType r
- -- Note: The successor must be a procpoint, and we have already split,
- -- so we use a jump, not a branch.
- succLbl = CmmLit (CmmLabel (infoTblLbl succ))
- jump = CmmCall { cml_target = succLbl, cml_cont = Nothing
- , cml_args = widthInBytes wordWidth ,cml_ret_args = 0
- , cml_ret_off = updfr_off}
- graph' <- liftUniq $ labelAGraph bid $ catAGraphs (map M.mkMiddle m) <**>
- suspend <**> midCall <**>
- resume <**> saveRetVals <**> M.mkLast jump
- return $ blocks `mapUnion` toBlockMap graph'
-lowerSafeForeignCall _ _ _ _ _ _ = panic "lowerSafeForeignCall was passed something else"
-
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index d3d9ba4b41..a76ad6f00a 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -7,7 +7,8 @@
module CmmCallConv (
ParamLocation(..),
- assignArgumentsPos
+ assignArgumentsPos,
+ globalArgRegs
) where
#include "HsVersions.h"
@@ -53,7 +54,6 @@ assignArgumentsPos conv arg_ty reps = assignments
([_], PrimOpReturn) -> allRegs
(_, PrimOpReturn) -> getRegsWithNode
(_, Slow) -> noRegs
- _ -> pprPanic "Unknown calling convention" (ppr conv)
-- The calling conventions first assign arguments to registers,
-- then switch to the stack when we first run out of registers
-- (even if there are still available registers for args of a different type).
@@ -130,18 +130,25 @@ getRegsWithNode =
(intRegs, map FloatReg floatRegNos, map DoubleReg doubleRegNos, map LongReg longRegNos)
where intRegs = map VanillaReg vanillaRegNos
-allVanillaRegNos, allFloatRegNos, allDoubleRegNos, allLongRegNos :: [Int]
-allVanillaRegNos = regList mAX_Vanilla_REG
-allFloatRegNos = regList mAX_Float_REG
-allDoubleRegNos = regList mAX_Double_REG
-allLongRegNos = regList mAX_Long_REG
+allFloatRegs, allDoubleRegs, allLongRegs :: [GlobalReg]
+allVanillaRegs :: [VGcPtr -> GlobalReg]
+
+allVanillaRegs = map VanillaReg $ regList mAX_Vanilla_REG
+allFloatRegs = map FloatReg $ regList mAX_Float_REG
+allDoubleRegs = map DoubleReg $ regList mAX_Double_REG
+allLongRegs = map LongReg $ regList mAX_Long_REG
regList :: Int -> [Int]
regList n = [1 .. n]
allRegs :: AvailRegs
-allRegs = (map VanillaReg allVanillaRegNos, map FloatReg allFloatRegNos,
- map DoubleReg allDoubleRegNos, map LongReg allLongRegNos)
+allRegs = (allVanillaRegs, allFloatRegs, allDoubleRegs, allLongRegs)
noRegs :: AvailRegs
-noRegs = ([], [], [], [])
+noRegs = ([], [], [], [])
+
+globalArgRegs :: [GlobalReg]
+globalArgRegs = map ($VGcPtr) allVanillaRegs ++
+ allFloatRegs ++
+ allDoubleRegs ++
+ allLongRegs
diff --git a/compiler/cmm/CmmCommonBlockElim.hs b/compiler/cmm/CmmCommonBlockElim.hs
index abbfd01156..614edf23a2 100644
--- a/compiler/cmm/CmmCommonBlockElim.hs
+++ b/compiler/cmm/CmmCommonBlockElim.hs
@@ -13,22 +13,22 @@ where
import BlockId
import Cmm
import CmmUtils
+import CmmContFlowOpt
import Prelude hiding (iterate, succ, unzip, zip)
-import Compiler.Hoopl
+import Hoopl hiding (ChangeFlag)
import Data.Bits
import qualified Data.List as List
import Data.Word
-import FastString
-import Control.Monad
import Outputable
import UniqFM
-import Unique
my_trace :: String -> SDoc -> a -> a
my_trace = if False then pprTrace else \_ _ a -> a
--- Eliminate common blocks:
+-- -----------------------------------------------------------------------------
+-- Eliminate common blocks
+
-- If two blocks are identical except for the label on the first node,
-- then we can eliminate one of the blocks. To ensure that the semantics
-- of the program are preserved, we have to rewrite each predecessor of the
@@ -42,59 +42,50 @@ my_trace = if False then pprTrace else \_ _ a -> a
-- TODO: Use optimization fuel
elimCommonBlocks :: CmmGraph -> CmmGraph
-elimCommonBlocks g =
- upd_graph g . snd $ iterate common_block reset hashed_blocks
- (emptyUFM, mapEmpty)
- where hashed_blocks = map (\b -> (hash_block b, b)) (reverse (postorderDfs g))
- reset (_, subst) = (emptyUFM, subst)
+elimCommonBlocks g = replaceLabels env g
+ where
+ env = iterate hashed_blocks mapEmpty
+ hashed_blocks = map (\b -> (hash_block b, b)) $ postorderDfs g
-- Iterate over the blocks until convergence
-iterate :: (t -> a -> (Bool, t)) -> (t -> t) -> [a] -> t -> t
-iterate upd reset blocks state =
- case foldl upd' (False, state) blocks of
- (True, state') -> iterate upd reset blocks (reset state')
- (False, state') -> state'
- where upd' (b, s) a = let (b', s') = upd s a in (b || b', s') -- lift to track changes
+iterate :: [(HashCode,CmmBlock)] -> BlockEnv BlockId -> BlockEnv BlockId
+iterate blocks subst =
+ case foldl common_block (False, emptyUFM, subst) blocks of
+ (changed, _, subst)
+ | changed -> iterate blocks subst
+ | otherwise -> subst
+
+type State = (ChangeFlag, UniqFM [CmmBlock], BlockEnv BlockId)
+
+type ChangeFlag = Bool
+type HashCode = Int
-- Try to find a block that is equal (or ``common'') to b.
-type BidMap = BlockEnv BlockId
-type State = (UniqFM [CmmBlock], BidMap)
-common_block :: (Outputable h, Uniquable h) => State -> (h, CmmBlock) -> (Bool, State)
-common_block (bmap, subst) (hash, b) =
+common_block :: State -> (HashCode, CmmBlock) -> State
+common_block (old_change, bmap, subst) (hash, b) =
case lookupUFM bmap hash of
Just bs -> case (List.find (eqBlockBodyWith (eqBid subst) b) bs,
mapLookup bid subst) of
(Just b', Nothing) -> addSubst b'
(Just b', Just b'') | entryLabel b' /= b'' -> addSubst b'
- _ -> (False, (addToUFM bmap hash (b : bs), subst))
- Nothing -> (False, (addToUFM bmap hash [b], subst))
+ | otherwise -> (old_change, bmap, subst)
+ _ -> (old_change, addToUFM bmap hash (b : bs), subst)
+ Nothing -> (old_change, addToUFM bmap hash [b], subst)
where bid = entryLabel b
- addSubst b' = my_trace "found new common block" (ppr (entryLabel b')) $
- (True, (bmap, mapInsert bid (entryLabel b') subst))
-
--- Given the map ``subst'' from BlockId -> BlockId, we rewrite the graph.
-upd_graph :: CmmGraph -> BidMap -> CmmGraph
-upd_graph g subst = mapGraphNodes (id, middle, last) g
- where middle = mapExpDeep exp
- last l = last' (mapExpDeep exp l)
- last' :: CmmNode O C -> CmmNode O C
- last' (CmmBranch bid) = CmmBranch $ sub bid
- last' (CmmCondBranch p t f) = cond p (sub t) (sub f)
- last' (CmmCall t (Just bid) a r o) = CmmCall t (Just $ sub bid) a r o
- last' l@(CmmCall _ Nothing _ _ _) = l
- last' (CmmForeignCall t r a bid u i) = CmmForeignCall t r a (sub bid) u i
- last' (CmmSwitch e bs) = CmmSwitch e $ map (liftM sub) bs
- cond p t f = if t == f then CmmBranch t else CmmCondBranch p t f
- exp (CmmStackSlot (CallArea (Young id)) off) =
- CmmStackSlot (CallArea (Young (sub id))) off
- exp (CmmLit (CmmBlock id)) = CmmLit (CmmBlock (sub id))
- exp e = e
- sub = lookupBid subst
+ addSubst b' = my_trace "found new common block" (ppr bid <> char '=' <> ppr (entryLabel b')) $
+ (True, bmap, mapInsert bid (entryLabel b') subst)
+
+
+-- -----------------------------------------------------------------------------
+-- Hashing and equality on blocks
+
+-- Below here is mostly boilerplate: hashing blocks ignoring labels,
+-- and comparing blocks modulo a label mapping.
-- To speed up comparisons, we hash each basic block modulo labels.
-- The hashing is a bit arbitrary (the numbers are completely arbitrary),
-- but it should be fast and good enough.
-hash_block :: CmmBlock -> Int
+hash_block :: CmmBlock -> HashCode
hash_block block =
fromIntegral (foldBlockNodesB3 (hash_fst, hash_mid, hash_lst) block (0 :: Word32) .&. (0x7fffffff :: Word32))
-- UniqFM doesn't like negative Ints
@@ -103,13 +94,13 @@ hash_block block =
hash_lst m h = hash_node m + h `shiftL` 1
hash_node :: CmmNode O x -> Word32
- hash_node (CmmComment (FastString u _ _ _ _)) = cvt u
+ hash_node (CmmComment _) = 0 -- don't care
hash_node (CmmAssign r e) = hash_reg r + hash_e e
hash_node (CmmStore e e') = hash_e e + hash_e e'
hash_node (CmmUnsafeForeignCall t _ as) = hash_tgt t + hash_list hash_e as
- hash_node (CmmBranch _) = 23 -- would be great to hash these properly
+ hash_node (CmmBranch _) = 23 -- NB. ignore the label
hash_node (CmmCondBranch p _ _) = hash_e p
- hash_node (CmmCall e _ _ _ _) = hash_e e
+ hash_node (CmmCall e _ _ _ _ _) = hash_e e
hash_node (CmmForeignCall t _ _ _ _ _) = hash_tgt t
hash_node (CmmSwitch e _) = hash_e e
@@ -143,25 +134,67 @@ hash_block block =
-- Utilities: equality and substitution on the graph.
-- Given a map ``subst'' from BlockID -> BlockID, we define equality.
-eqBid :: BidMap -> BlockId -> BlockId -> Bool
+eqBid :: BlockEnv BlockId -> BlockId -> BlockId -> Bool
eqBid subst bid bid' = lookupBid subst bid == lookupBid subst bid'
-lookupBid :: BidMap -> BlockId -> BlockId
+lookupBid :: BlockEnv BlockId -> BlockId -> BlockId
lookupBid subst bid = case mapLookup bid subst of
Just bid -> lookupBid subst bid
Nothing -> bid
--- Equality on the body of a block, modulo a function mapping block IDs to block IDs.
+-- Middle nodes and expressions can contain BlockIds, in particular in
+-- CmmStackSlot and CmmBlock, so we have to use a special equality for
+-- these.
+--
+eqMiddleWith :: (BlockId -> BlockId -> Bool)
+ -> CmmNode O O -> CmmNode O O -> Bool
+eqMiddleWith _ (CmmComment _) (CmmComment _) = True
+eqMiddleWith eqBid (CmmAssign r1 e1) (CmmAssign r2 e2)
+ = r1 == r2 && eqExprWith eqBid e1 e2
+eqMiddleWith eqBid (CmmStore l1 r1) (CmmStore l2 r2)
+ = eqExprWith eqBid l1 l2 && eqExprWith eqBid r1 r2
+eqMiddleWith eqBid (CmmUnsafeForeignCall t1 r1 a1)
+ (CmmUnsafeForeignCall t2 r2 a2)
+ = t1 == t2 && r1 == r2 && and (zipWith (eqExprWith eqBid) a1 a2)
+eqMiddleWith _ _ _ = False
+
+eqExprWith :: (BlockId -> BlockId -> Bool)
+ -> CmmExpr -> CmmExpr -> Bool
+eqExprWith eqBid = eq
+ where
+ CmmLit l1 `eq` CmmLit l2 = eqLit l1 l2
+ CmmLoad e1 _ `eq` CmmLoad e2 _ = e1 `eq` e2
+ CmmReg r1 `eq` CmmReg r2 = r1==r2
+ CmmRegOff r1 i1 `eq` CmmRegOff r2 i2 = r1==r2 && i1==i2
+ CmmMachOp op1 es1 `eq` CmmMachOp op2 es2 = op1==op2 && es1 `eqs` es2
+ CmmStackSlot a1 i1 `eq` CmmStackSlot a2 i2 = eqArea a1 a2 && i1==i2
+ _e1 `eq` _e2 = False
+
+ xs `eqs` ys = and (zipWith eq xs ys)
+
+ eqLit (CmmBlock id1) (CmmBlock id2) = eqBid id1 id2
+ eqLit l1 l2 = l1 == l2
+
+ eqArea Old Old = True
+ eqArea (Young id1) (Young id2) = eqBid id1 id2
+ eqArea _ _ = False
+
+-- Equality on the body of a block, modulo a function mapping block
+-- IDs to block IDs.
eqBlockBodyWith :: (BlockId -> BlockId -> Bool) -> CmmBlock -> CmmBlock -> Bool
-eqBlockBodyWith eqBid block block' = middles == middles' && eqLastWith eqBid last last'
- where (_, middles , JustC last :: MaybeC C (CmmNode O C)) = blockToNodeList block
- (_, middles', JustC last' :: MaybeC C (CmmNode O C)) = blockToNodeList block'
+eqBlockBodyWith eqBid block block'
+ = and (zipWith (eqMiddleWith eqBid) (blockToList m) (blockToList m')) &&
+ eqLastWith eqBid l l'
+ where (_,m,l) = blockSplit block
+ (_,m',l') = blockSplit block'
+
+
eqLastWith :: (BlockId -> BlockId -> Bool) -> CmmNode O C -> CmmNode O C -> Bool
eqLastWith eqBid (CmmBranch bid1) (CmmBranch bid2) = eqBid bid1 bid2
eqLastWith eqBid (CmmCondBranch c1 t1 f1) (CmmCondBranch c2 t2 f2) =
c1 == c2 && eqBid t1 t2 && eqBid f1 f2
-eqLastWith eqBid (CmmCall t1 c1 a1 r1 u1) (CmmCall t2 c2 a2 r2 u2) =
- t1 == t2 && eqMaybeWith eqBid c1 c2 && a1 == a2 && r1 == r2 && u1 == u2
+eqLastWith eqBid (CmmCall t1 c1 g1 a1 r1 u1) (CmmCall t2 c2 g2 a2 r2 u2) =
+ t1 == t2 && eqMaybeWith eqBid c1 c2 && a1 == a2 && r1 == r2 && u1 == u2 && g1 == g2
eqLastWith eqBid (CmmSwitch e1 bs1) (CmmSwitch e2 bs2) =
e1 == e2 && eqListWith (eqMaybeWith eqBid) bs1 bs2
eqLastWith _ _ _ = False
diff --git a/compiler/cmm/CmmContFlowOpt.hs b/compiler/cmm/CmmContFlowOpt.hs
index 73ce57e93f..f9fa68062e 100644
--- a/compiler/cmm/CmmContFlowOpt.hs
+++ b/compiler/cmm/CmmContFlowOpt.hs
@@ -2,19 +2,19 @@
{-# OPTIONS_GHC -fno-warn-warnings-deprecations -fno-warn-incomplete-patterns #-}
module CmmContFlowOpt
- ( runCmmContFlowOpts
- , removeUnreachableBlocks, replaceBranches
+ ( cmmCfgOpts
+ , cmmCfgOptsProc
+ , removeUnreachableBlocks
+ , replaceLabels
)
where
import BlockId
import Cmm
import CmmUtils
-import Digraph
import Maybes
-import Outputable
-import Compiler.Hoopl
+import Hoopl
import Control.Monad
import Prelude hiding (succ, unzip, zip)
@@ -24,196 +24,189 @@ import Prelude hiding (succ, unzip, zip)
--
-----------------------------------------------------------------------------
-runCmmContFlowOpts :: CmmGroup -> CmmGroup
-runCmmContFlowOpts = map (optProc cmmCfgOpts)
-
cmmCfgOpts :: CmmGraph -> CmmGraph
-cmmCfgOpts = removeUnreachableBlocks . blockConcat . branchChainElim
- -- Here branchChainElim can ultimately be replaced
- -- with a more exciting combination of optimisations
+cmmCfgOpts = removeUnreachableBlocks . blockConcat
+
+cmmCfgOptsProc :: CmmDecl -> CmmDecl
+cmmCfgOptsProc = optProc cmmCfgOpts
optProc :: (g -> g) -> GenCmmDecl d h g -> GenCmmDecl d h g
optProc opt (CmmProc info lbl g) = CmmProc info lbl (opt g)
optProc _ top = top
+
-----------------------------------------------------------------------------
--
--- Branch Chain Elimination
+-- Block concatenation
--
-----------------------------------------------------------------------------
--- | Remove any basic block of the form L: goto L', and replace L with
--- L' everywhere else, unless L is the successor of a call instruction
--- and L' is the entry block. You don't want to set the successor of a
--- function call to the entry block because there is no good way to
--- store both the infotables for the call and from the callee, while
--- putting the stack pointer in a consistent place.
+-- This optimisation does two things:
+-- - If a block finishes with an unconditional branch, then we may
+-- be able to concatenate the block it points to and remove the
+-- branch. We do this either if the destination block is small
+-- (e.g. just another branch), or if this is the only jump to
+-- this particular destination block.
+--
+-- - If a block finishes in a call whose continuation block is a
+-- goto, then we can shortcut the destination, making the
+-- continuation block the destination of the goto.
+--
+-- Both transformations are improved by working from the end of the
+-- graph towards the beginning, because we may be able to perform many
+-- shortcuts in one go.
+
+
+-- We need to walk over the blocks from the end back to the
+-- beginning. We are going to maintain the "current" graph
+-- (BlockEnv CmmBlock) as we go, and also a mapping from BlockId
+-- to BlockId, representing continuation labels that we have
+-- renamed. This latter mapping is important because we might
+-- shortcut a CmmCall continuation. For example:
+--
+-- Sp[0] = L
+-- call g returns to L
+--
+-- L: goto M
--
--- JD isn't quite sure when it's safe to share continuations for different
--- function calls -- have to think about where the SP will be,
--- so we'll table that problem for now by leaving all call successors alone.
-
-branchChainElim :: CmmGraph -> CmmGraph
-branchChainElim g
- | null lone_branch_blocks = g -- No blocks to remove
- | otherwise = {- pprTrace "branchChainElim" (ppr forest) $ -}
- replaceLabels (mapFromList edges) g
+-- M: ...
+--
+-- So when we shortcut the L block, we need to replace not only
+-- the continuation of the call, but also references to L in the
+-- code (e.g. the assignment Sp[0] = L). So we keep track of
+-- which labels we have renamed and apply the mapping at the end
+-- with replaceLabels.
+
+blockConcat :: CmmGraph -> CmmGraph
+blockConcat g@CmmGraph { g_entry = entry_id }
+ = replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks
where
- blocks = toBlockList g
-
- lone_branch_blocks :: [(BlockId, BlockId)]
- -- each (L,K) is a block of the form
- -- L : goto K
- lone_branch_blocks = mapCatMaybes isLoneBranch blocks
-
- call_succs = foldl add emptyBlockSet blocks
- where add :: BlockSet -> CmmBlock -> BlockSet
- add succs b =
- case lastNode b of
- (CmmCall _ (Just k) _ _ _) -> setInsert k succs
- (CmmForeignCall {succ=k}) -> setInsert k succs
- _ -> succs
-
- isLoneBranch :: CmmBlock -> Maybe (BlockId, BlockId)
- isLoneBranch block
- | (JustC (CmmEntry id), [], JustC (CmmBranch target)) <- blockToNodeList block
- , not (setMember id call_succs)
- = Just (id,target)
- | otherwise
- = Nothing
-
- -- We build a graph from lone_branch_blocks (every node has only
- -- one out edge). Then we
- -- - topologically sort the graph: if from A we can reach B,
- -- then A occurs before B in the result list.
- -- - depth-first search starting from the nodes in this list.
- -- This gives us a [[node]], in which each list is a dependency
- -- chain.
- -- - for each list [a1,a2,...an] replace branches to ai with an.
- --
- -- This approach nicely deals with cycles by ignoring them.
- -- Branches in a cycle will be redirected to somewhere in the
- -- cycle, but we don't really care where. A cycle should be dead code,
- -- and so will be eliminated by removeUnreachableBlocks.
- --
- fromNode (b,_) = b
- toNode a = (a,a)
-
- all_block_ids :: LabelSet
- all_block_ids = setFromList (map fst lone_branch_blocks)
- `setUnion`
- setFromList (map snd lone_branch_blocks)
-
- forest = dfsTopSortG $ graphFromVerticesAndAdjacency nodes lone_branch_blocks
- where nodes = map toNode $ setElems $ all_block_ids
-
- edges = [ (fromNode y, fromNode x)
- | (x:xs) <- map reverse forest, y <- xs ]
+ -- we might be able to shortcut the entry BlockId itself
+ new_entry
+ | Just entry_blk <- mapLookup entry_id new_blocks
+ , Just dest <- canShortcut entry_blk
+ = dest
+ | otherwise
+ = entry_id
-----------------------------------------------------------------
+ blocks = postorderDfs g
+
+ (new_blocks, shortcut_map) =
+ foldr maybe_concat (toBlockMap g, mapEmpty) blocks
+
+ maybe_concat :: CmmBlock
+ -> (BlockEnv CmmBlock, BlockEnv BlockId)
+ -> (BlockEnv CmmBlock, BlockEnv BlockId)
+ maybe_concat block (blocks, shortcut_map)
+ | CmmBranch b' <- last
+ , Just blk' <- mapLookup b' blocks
+ , shouldConcatWith b' blk'
+ = (mapInsert bid (splice head blk') blocks, shortcut_map)
+
+ -- calls: if we can shortcut the continuation label, then
+ -- we must *also* remember to substitute for the label in the
+ -- code, because we will push it somewhere.
+ | Just b' <- callContinuation_maybe last
+ , Just blk' <- mapLookup b' blocks
+ , Just dest <- canShortcut blk'
+ = (blocks, mapInsert b' dest shortcut_map)
+ -- replaceLabels will substitute dest for b' everywhere, later
+
+ -- non-calls: see if we can shortcut any of the successors.
+ | Nothing <- callContinuation_maybe last
+ = ( mapInsert bid (blockJoinTail head shortcut_last) blocks
+ , shortcut_map )
+
+ | otherwise
+ = (blocks, shortcut_map)
+ where
+ (head, last) = blockSplitTail block
+ bid = entryLabel block
+ shortcut_last = mapSuccessors shortcut last
+ shortcut l =
+ case mapLookup l blocks of
+ Just b | Just dest <- canShortcut b -> dest
+ _otherwise -> l
+
+ shouldConcatWith b block
+ | num_preds b == 1 = True -- only one predecessor: go for it
+ | okToDuplicate block = True -- short enough to duplicate
+ | otherwise = False
+ where num_preds bid = mapLookup bid backEdges `orElse` 0
+
+ canShortcut :: CmmBlock -> Maybe BlockId
+ canShortcut block
+ | (_, middle, CmmBranch dest) <- blockSplit block
+ , isEmptyBlock middle
+ = Just dest
+ | otherwise
+ = Nothing
+
+ backEdges :: BlockEnv Int -- number of predecessors for each block
+ backEdges = mapInsertWith (+) entry_id 1 $ -- add 1 for the entry id
+ mapMap setSize $ predMap blocks
+
+ splice :: Block CmmNode C O -> CmmBlock -> CmmBlock
+ splice head rest = head `blockAppend` snd (blockSplitHead rest)
+
+
+callContinuation_maybe :: CmmNode O C -> Maybe BlockId
+callContinuation_maybe (CmmCall { cml_cont = Just b }) = Just b
+callContinuation_maybe (CmmForeignCall { succ = b }) = Just b
+callContinuation_maybe _ = Nothing
+
+okToDuplicate :: CmmBlock -> Bool
+okToDuplicate block
+ = case blockSplit block of
+ (_, m, CmmBranch _) -> isEmptyBlock m
+ -- cheap and cheerful; we might expand this in the future to
+ -- e.g. spot blocks that represent a single instruction or two.
+ -- Be careful: a CmmCall can be more than one instruction, it
+ -- has a CmmExpr inside it.
+ _otherwise -> False
+
+------------------------------------------------------------------------
+-- Map over the CmmGraph, replacing each label with its mapping in the
+-- supplied BlockEnv.
replaceLabels :: BlockEnv BlockId -> CmmGraph -> CmmGraph
-replaceLabels env =
- replace_eid . mapGraphNodes1 txnode
+replaceLabels env g
+ | mapNull env = g
+ | otherwise = replace_eid $ mapGraphNodes1 txnode g
where
replace_eid g = g {g_entry = lookup (g_entry g)}
lookup id = mapLookup id env `orElse` id
txnode :: CmmNode e x -> CmmNode e x
txnode (CmmBranch bid) = CmmBranch (lookup bid)
- txnode (CmmCondBranch p t f) = CmmCondBranch (exp p) (lookup t) (lookup f)
+ txnode (CmmCondBranch p t f) = mkCmmCondBranch (exp p) (lookup t) (lookup f)
txnode (CmmSwitch e arms) = CmmSwitch (exp e) (map (liftM lookup) arms)
- txnode (CmmCall t k a res r) = CmmCall (exp t) (liftM lookup k) a res r
+ txnode (CmmCall t k rg a res r) = CmmCall (exp t) (liftM lookup k) rg a res r
txnode fc@CmmForeignCall{} = fc{ args = map exp (args fc)
, succ = lookup (succ fc) }
txnode other = mapExpDeep exp other
exp :: CmmExpr -> CmmExpr
exp (CmmLit (CmmBlock bid)) = CmmLit (CmmBlock (lookup bid))
- exp (CmmStackSlot (CallArea (Young id)) i) = CmmStackSlot (CallArea (Young (lookup id))) i
+ exp (CmmStackSlot (Young id) i) = CmmStackSlot (Young (lookup id)) i
exp e = e
-
-replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph
-replaceBranches env g = mapGraphNodes (id, id, last) g
- where
- last :: CmmNode O C -> CmmNode O C
- last (CmmBranch id) = CmmBranch (lookup id)
- last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi)
- last (CmmSwitch e tbl) = CmmSwitch e (map (fmap lookup) tbl)
- last l@(CmmCall {}) = l
- last l@(CmmForeignCall {}) = l
- lookup id = fmap lookup (mapLookup id env) `orElse` id
- -- XXX: this is a recursive lookup, it follows chains until the lookup
- -- returns Nothing, at which point we return the last BlockId
+mkCmmCondBranch :: CmmExpr -> Label -> Label -> CmmNode O C
+mkCmmCondBranch p t f = if t == f then CmmBranch t else CmmCondBranch p t f
----------------------------------------------------------------
-- Build a map from a block to its set of predecessors. Very useful.
+
predMap :: [CmmBlock] -> BlockEnv BlockSet
predMap blocks = foldr add_preds mapEmpty blocks -- find the back edges
where add_preds block env = foldl (add (entryLabel block)) env (successors block)
add bid env b' =
mapInsert b' (setInsert bid (mapLookup b' env `orElse` setEmpty)) env
------------------------------------------------------------------------------
---
--- Block concatenation
---
------------------------------------------------------------------------------
-
--- If a block B branches to a label L, L is not the entry block,
--- and L has no other predecessors,
--- then we can splice the block starting with L onto the end of B.
--- Order matters, so we work bottom up (reverse postorder DFS).
--- This optimization can be inhibited by unreachable blocks, but
--- the reverse postorder DFS returns only reachable blocks.
---
--- To ensure correctness, we have to make sure that the BlockId of the block
--- we are about to eliminate is not named in another instruction.
---
--- Note: This optimization does _not_ subsume branch chain elimination.
-
-blockConcat :: CmmGraph -> CmmGraph
-blockConcat g@(CmmGraph {g_entry=eid}) =
- replaceLabels concatMap $ ofBlockMap (g_entry g) blocks'
- where
- blocks = postorderDfs g
-
- (blocks', concatMap) =
- foldr maybe_concat (toBlockMap g, mapEmpty) $ blocks
-
- maybe_concat :: CmmBlock -> (LabelMap CmmBlock, LabelMap Label) -> (LabelMap CmmBlock, LabelMap Label)
- maybe_concat b unchanged@(blocks', concatMap) =
- let bid = entryLabel b
- in case blockToNodeList b of
- (JustC h, m, JustC (CmmBranch b')) ->
- if canConcatWith b' then
- (mapInsert bid (splice blocks' h m b') blocks',
- mapInsert b' bid concatMap)
- else unchanged
- _ -> unchanged
-
- num_preds bid = liftM setSize (mapLookup bid backEdges) `orElse` 0
-
- canConcatWith b' = b' /= eid && num_preds b' == 1
-
- backEdges = predMap blocks
-
- splice :: forall map n e x.
- IsMap map =>
- map (Block n e x) -> n C O -> [n O O] -> KeyOf map -> Block n C x
- splice blocks' h m bid' =
- case mapLookup bid' blocks' of
- Nothing -> panic "unknown successor block"
- Just block | (_, m', l') <- blockToNodeList block
- -> blockOfNodeList (JustC h, (m ++ m'), l')
-
-----------------------------------------------------------------------------
--
-- Removing unreachable blocks
---
------------------------------------------------------------------------------
removeUnreachableBlocks :: CmmGraph -> CmmGraph
removeUnreachableBlocks g
diff --git a/compiler/cmm/CmmCvt.hs b/compiler/cmm/CmmCvt.hs
index 80c6079aac..204f26e24b 100644
--- a/compiler/cmm/CmmCvt.hs
+++ b/compiler/cmm/CmmCvt.hs
@@ -12,29 +12,25 @@ import CmmUtils
import qualified OldCmm as Old
import OldPprCmm ()
-import Compiler.Hoopl hiding ((<*>), mkLabel, mkBranch)
+import Hoopl hiding ((<*>), mkLabel, mkBranch)
import Data.Maybe
import Maybes
import Outputable
cmmOfZgraph :: CmmGroup -> Old.CmmGroup
cmmOfZgraph tops = map mapTop tops
- where mapTop (CmmProc h l g) = CmmProc (Old.CmmInfo Nothing Nothing (info_tbl h)) l (ofZgraph g)
+ where mapTop (CmmProc h l g) = CmmProc (info_tbl h) l (ofZgraph g)
mapTop (CmmData s ds) = CmmData s ds
data ValueDirection = Arguments | Results
-add_hints :: Convention -> ValueDirection -> [a] -> [Old.CmmHinted a]
+add_hints :: ForeignTarget -> ValueDirection -> [a] -> [Old.CmmHinted a]
add_hints conv vd args = zipWith Old.CmmHinted args (get_hints conv vd)
-get_hints :: Convention -> ValueDirection -> [ForeignHint]
-get_hints (Foreign (ForeignConvention _ hints _)) Arguments = hints
-get_hints (Foreign (ForeignConvention _ _ hints)) Results = hints
-get_hints _other_conv _vd = repeat NoHint
-
-get_conv :: ForeignTarget -> Convention
-get_conv (PrimTarget _) = NativeNodeCall -- JD: SUSPICIOUS
-get_conv (ForeignTarget _ fc) = Foreign fc
+get_hints :: ForeignTarget -> ValueDirection -> [ForeignHint]
+get_hints (ForeignTarget _ (ForeignConvention _ hints _)) Arguments = hints
+get_hints (ForeignTarget _ (ForeignConvention _ _ hints)) Results = hints
+get_hints (PrimTarget _) _vd = repeat NoHint
cmm_target :: ForeignTarget -> Old.CmmCallTarget
cmm_target (PrimTarget op) = Old.CmmPrim op Nothing
@@ -89,8 +85,8 @@ ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g
CmmUnsafeForeignCall (PrimTarget MO_Touch) _ _ -> Old.CmmNop
CmmUnsafeForeignCall target ress args ->
Old.CmmCall (cmm_target target)
- (add_hints (get_conv target) Results ress)
- (add_hints (get_conv target) Arguments args)
+ (add_hints target Results ress)
+ (add_hints target Arguments args)
Old.CmmMayReturn
last :: CmmNode O C -> () -> [Old.CmmStmt]
@@ -106,7 +102,7 @@ ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g
| otherwise -> [Old.CmmCondBranch expr tid, Old.CmmBranch fid]
CmmSwitch arg ids -> [Old.CmmSwitch arg ids]
-- ToDo: STG Live
- CmmCall e _ _ _ _ -> [Old.CmmJump e Nothing]
+ CmmCall e _ r _ _ _ -> [Old.CmmJump e (Just r)]
CmmForeignCall {} -> panic "ofZgraph: CmmForeignCall"
tail_of bid = case foldBlockNodesB3 (first, middle, last) block () of
Old.BasicBlock _ stmts -> stmts
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index 6eb91e89ba..646ecb5c67 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -14,11 +14,11 @@ module CmmExpr
, GlobalReg(..), globalRegType, spReg, hpReg, spLimReg, nodeReg, node, baseReg
, VGcPtr(..), vgcFlag -- Temporary!
, DefinerOfLocalRegs, UserOfLocalRegs, foldRegsDefd, foldRegsUsed, filterRegsUsed
- , DefinerOfSlots, UserOfSlots, foldSlotsDefd, foldSlotsUsed
, RegSet, emptyRegSet, elemRegSet, extendRegSet, deleteFromRegSet, mkRegSet
- , plusRegSet, minusRegSet, timesRegSet
- , regUsedIn, regSlot
- , Area(..), AreaId(..), SubArea, SubAreaSet, AreaMap, isStackSlotOf
+ , plusRegSet, minusRegSet, timesRegSet, sizeRegSet, nullRegSet
+ , regSetToList
+ , regUsedIn
+ , Area(..)
, module CmmMachOp
, module CmmType
)
@@ -31,9 +31,9 @@ import CmmMachOp
import BlockId
import CLabel
import Unique
-import UniqSet
-import Data.Map (Map)
+import Data.Set (Set)
+import qualified Data.Set as Set
-----------------------------------------------------------------------------
-- CmmExpr
@@ -42,11 +42,12 @@ import Data.Map (Map)
data CmmExpr
= CmmLit CmmLit -- Literal
- | CmmLoad CmmExpr CmmType -- Read memory location
- | CmmReg CmmReg -- Contents of register
+ | CmmLoad !CmmExpr !CmmType -- Read memory location
+ | CmmReg !CmmReg -- Contents of register
| CmmMachOp MachOp [CmmExpr] -- Machine operation (+, -, *, etc.)
- | CmmStackSlot Area Int -- addressing expression of a stack slot
- | CmmRegOff CmmReg Int
+ | CmmStackSlot Area {-# UNPACK #-} !Int
+ -- addressing expression of a stack slot
+ | CmmRegOff !CmmReg Int
-- CmmRegOff reg i
-- ** is shorthand only, meaning **
-- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
@@ -62,20 +63,16 @@ instance Eq CmmExpr where -- Equality ignores the types
_e1 == _e2 = False
data CmmReg
- = CmmLocal LocalReg
+ = CmmLocal {-# UNPACK #-} !LocalReg
| CmmGlobal GlobalReg
deriving( Eq, Ord )
-- | A stack area is either the stack slot where a variable is spilled
-- or the stack space where function arguments and results are passed.
data Area
- = RegSlot LocalReg
- | CallArea AreaId
- deriving (Eq, Ord)
-
-data AreaId
= Old -- See Note [Old Area]
- | Young BlockId
+ | Young {-# UNPACK #-} !BlockId -- Invariant: must be a continuation BlockId
+ -- See Note [Continuation BlockId] in CmmNode.
deriving (Eq, Ord)
{- Note [Old Area]
@@ -94,15 +91,8 @@ necessarily at the young end of the Old area.
End of note -}
-type SubArea = (Area, Int, Int) -- area, offset, width
-type SubAreaSet = Map Area [SubArea]
-
-type AreaMap = Map Area Int
- -- Byte offset of the oldest byte of the Area,
- -- relative to the oldest byte of the Old Area
-
data CmmLit
- = CmmInt Integer Width
+ = CmmInt !Integer Width
-- Interpretation: the 2's complement representation of the value
-- is truncated to the specified size. This is easier than trying
-- to keep the value within range, because we don't know whether
@@ -120,7 +110,11 @@ data CmmLit
-- It is also used inside the NCG during when generating
-- position-independent code.
| CmmLabelDiffOff CLabel CLabel Int -- label1 - label2 + offset
- | CmmBlock BlockId -- Code label
+
+ | CmmBlock {-# UNPACK #-} !BlockId -- Code label
+ -- Invariant: must be a continuation BlockId
+ -- See Note [Continuation BlockId] in CmmNode.
+
| CmmHighStackMark -- stands for the max stack space used during a procedure
deriving Eq
@@ -163,7 +157,7 @@ maybeInvertCmmExpr _ = Nothing
-----------------------------------------------------------------------------
data LocalReg
- = LocalReg !Unique CmmType
+ = LocalReg {-# UNPACK #-} !Unique CmmType
-- ^ Parameters:
-- 1. Identifier
-- 2. Type
@@ -189,22 +183,35 @@ localRegType (LocalReg _ rep) = rep
-----------------------------------------------------------------------------
-- | Sets of local registers
-type RegSet = UniqSet LocalReg
+
+-- These are used for dataflow facts, and a common operation is taking
+-- the union of two RegSets and then asking whether the union is the
+-- same as one of the inputs. UniqSet isn't good here, because
+-- sizeUniqSet is O(n) whereas Set.size is O(1), so we use ordinary
+-- Sets.
+
+type RegSet = Set LocalReg
emptyRegSet :: RegSet
+nullRegSet :: RegSet -> Bool
elemRegSet :: LocalReg -> RegSet -> Bool
extendRegSet :: RegSet -> LocalReg -> RegSet
deleteFromRegSet :: RegSet -> LocalReg -> RegSet
mkRegSet :: [LocalReg] -> RegSet
minusRegSet, plusRegSet, timesRegSet :: RegSet -> RegSet -> RegSet
-
-emptyRegSet = emptyUniqSet
-elemRegSet = elementOfUniqSet
-extendRegSet = addOneToUniqSet
-deleteFromRegSet = delOneFromUniqSet
-mkRegSet = mkUniqSet
-minusRegSet = minusUniqSet
-plusRegSet = unionUniqSets
-timesRegSet = intersectUniqSets
+sizeRegSet :: RegSet -> Int
+regSetToList :: RegSet -> [LocalReg]
+
+emptyRegSet = Set.empty
+nullRegSet = Set.null
+elemRegSet = Set.member
+extendRegSet = flip Set.insert
+deleteFromRegSet = flip Set.delete
+mkRegSet = Set.fromList
+minusRegSet = Set.difference
+plusRegSet = Set.union
+timesRegSet = Set.intersection
+sizeRegSet = Set.size
+regSetToList = Set.toList
class UserOfLocalRegs a where
foldRegsUsed :: (b -> LocalReg -> b) -> b -> a -> b
@@ -236,7 +243,7 @@ instance DefinerOfLocalRegs LocalReg where
foldRegsDefd f z r = f z r
instance UserOfLocalRegs RegSet where
- foldRegsUsed f = foldUniqSet (flip f)
+ foldRegsUsed f = Set.fold (flip f)
instance UserOfLocalRegs CmmExpr where
foldRegsUsed f z e = expr z e
@@ -271,49 +278,6 @@ reg `regUsedIn` CmmMachOp _ es = any (reg `regUsedIn`) es
_ `regUsedIn` CmmStackSlot _ _ = False
-----------------------------------------------------------------------------
--- Stack slots
------------------------------------------------------------------------------
-
-isStackSlotOf :: CmmExpr -> LocalReg -> Bool
-isStackSlotOf (CmmStackSlot (RegSlot r) _) r' = r == r'
-isStackSlotOf _ _ = False
-
-regSlot :: LocalReg -> CmmExpr
-regSlot r = CmmStackSlot (RegSlot r) (widthInBytes $ typeWidth $ localRegType r)
-
------------------------------------------------------------------------------
--- Stack slot use information for expressions and other types [_$_]
------------------------------------------------------------------------------
-
--- Fold over the area, the offset into the area, and the width of the subarea.
-class UserOfSlots a where
- foldSlotsUsed :: (b -> SubArea -> b) -> b -> a -> b
-
-class DefinerOfSlots a where
- foldSlotsDefd :: (b -> SubArea -> b) -> b -> a -> b
-
-instance UserOfSlots CmmExpr where
- foldSlotsUsed f z e = expr z e
- where expr z (CmmLit _) = z
- expr z (CmmLoad (CmmStackSlot a i) ty) = f z (a, i, widthInBytes $ typeWidth ty)
- expr z (CmmLoad addr _) = foldSlotsUsed f z addr
- expr z (CmmReg _) = z
- expr z (CmmMachOp _ exprs) = foldSlotsUsed f z exprs
- expr z (CmmRegOff _ _) = z
- expr z (CmmStackSlot _ _) = z
-
-instance UserOfSlots a => UserOfSlots [a] where
- foldSlotsUsed _ set [] = set
- foldSlotsUsed f set (x:xs) = foldSlotsUsed f (foldSlotsUsed f set x) xs
-
-instance DefinerOfSlots a => DefinerOfSlots [a] where
- foldSlotsDefd _ set [] = set
- foldSlotsDefd f set (x:xs) = foldSlotsDefd f (foldSlotsDefd f set x) xs
-
-instance DefinerOfSlots SubArea where
- foldSlotsDefd f z a = f z a
-
------------------------------------------------------------------------------
-- Global STG registers
-----------------------------------------------------------------------------
diff --git a/compiler/cmm/CmmInfo.hs b/compiler/cmm/CmmInfo.hs
index fe0c104d1c..a171faa057 100644
--- a/compiler/cmm/CmmInfo.hs
+++ b/compiler/cmm/CmmInfo.hs
@@ -19,6 +19,8 @@ import CmmUtils
import CLabel
import SMRep
import Bitmap
+import Stream (Stream)
+import qualified Stream
import Maybes
import Constants
@@ -40,10 +42,16 @@ mkEmptyContInfoTable info_lbl
, cit_prof = NoProfilingInfo
, cit_srt = NoC_SRT }
-cmmToRawCmm :: Platform -> [Old.CmmGroup] -> IO [Old.RawCmmGroup]
+cmmToRawCmm :: Platform -> Stream IO Old.CmmGroup ()
+ -> IO (Stream IO Old.RawCmmGroup ())
cmmToRawCmm platform cmms
= do { uniqs <- mkSplitUniqSupply 'i'
- ; return (initUs_ uniqs (mapM (concatMapM (mkInfoTable platform)) cmms)) }
+ ; let do_one uniqs cmm = do
+ case initUs uniqs $ concatMapM (mkInfoTable platform) cmm of
+ (b,uniqs') -> return (uniqs',b)
+ -- NB. strictness fixes a space leak. DO NOT REMOVE.
+ ; return (Stream.mapAccumL do_one uniqs cmms >> return ())
+ }
-- Make a concrete info table, represented as a list of CmmStatic
-- (it can't be simply a list of Word, because the SRT field is
@@ -82,7 +90,7 @@ mkInfoTable :: Platform -> CmmDecl -> UniqSM [RawCmmDecl]
mkInfoTable _ (CmmData sec dat)
= return [CmmData sec dat]
-mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks)
+mkInfoTable platform (CmmProc info entry_label blocks)
| CmmNonInfoTable <- info -- Code without an info table. Easy.
= return [CmmProc Nothing entry_label blocks]
@@ -91,7 +99,8 @@ mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks)
; return (top_decls ++
mkInfoTableAndCode info_lbl info_cts
entry_label blocks) }
- | otherwise = panic "mkInfoTable" -- Patern match overlap check not clever enough
+ | otherwise = panic "mkInfoTable"
+ -- Patern match overlap check not clever enough
-----------------------------------------------------
type InfoTableContents = ( [CmmLit] -- The standard part
diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs
new file mode 100644
index 0000000000..3ee06215bc
--- /dev/null
+++ b/compiler/cmm/CmmLayoutStack.hs
@@ -0,0 +1,1049 @@
+{-# LANGUAGE RecordWildCards, GADTs #-}
+module CmmLayoutStack (
+ cmmLayoutStack, setInfoTableStackMap, cmmSink
+ ) where
+
+import StgCmmUtils ( callerSaveVolatileRegs ) -- XXX
+import StgCmmForeign ( saveThreadState, loadThreadState ) -- XXX
+
+import Cmm
+import BlockId
+import CLabel
+import CmmUtils
+import MkGraph
+import Module
+import ForeignCall
+import CmmLive
+import CmmProcPoint
+import SMRep
+import Hoopl hiding ((<*>), mkLast, mkMiddle)
+import Constants
+import UniqSupply
+import Maybes
+import UniqFM
+import Util
+
+import FastString
+import Outputable
+import Data.Map (Map)
+import qualified Data.Map as Map
+import qualified Data.Set as Set
+import Control.Monad.Fix
+import Data.Array as Array
+import Data.Bits
+import Data.List (nub, partition)
+import Control.Monad (liftM)
+
+#include "HsVersions.h"
+
+
+data StackSlot = Occupied | Empty
+ -- Occupied: a return address or part of an update frame
+
+instance Outputable StackSlot where
+ ppr Occupied = ptext (sLit "XXX")
+ ppr Empty = ptext (sLit "---")
+
+-- All stack locations are expressed as positive byte offsets from the
+-- "base", which is defined to be the address above the return address
+-- on the stack on entry to this CmmProc.
+--
+-- Lower addresses have higher StackLocs.
+--
+type StackLoc = ByteOff
+
+{-
+ A StackMap describes the stack at any given point. At a continuation
+ it has a particular layout, like this:
+
+ | | <- base
+ |-------------|
+ | ret0 | <- base + 8
+ |-------------|
+ . upd frame . <- base + sm_ret_off
+ |-------------|
+ | |
+ . vars .
+ . (live/dead) .
+ | | <- base + sm_sp - sm_args
+ |-------------|
+ | ret1 |
+ . ret vals . <- base + sm_sp (<--- Sp points here)
+ |-------------|
+
+Why do we include the final return address (ret0) in our stack map? I
+have absolutely no idea, but it seems to be done that way consistently
+in the rest of the code generator, so I played along here. --SDM
+
+Note that we will be constructing an info table for the continuation
+(ret1), which needs to describe the stack down to, but not including,
+the update frame (or ret0, if there is no update frame).
+-}
+
+data StackMap = StackMap
+ { sm_sp :: StackLoc
+ -- ^ the offset of Sp relative to the base on entry
+ -- to this block.
+ , sm_args :: ByteOff
+ -- ^ the number of bytes of arguments in the area for this block
+ -- Defn: the offset of young(L) relative to the base is given by
+ -- (sm_sp - sm_args) of the StackMap for block L.
+ , sm_ret_off :: ByteOff
+ -- ^ Number of words of stack that we do not describe with an info
+ -- table, because it contains an update frame.
+ , sm_regs :: UniqFM (LocalReg,StackLoc)
+ -- ^ regs on the stack
+ }
+
+instance Outputable StackMap where
+ ppr StackMap{..} =
+ text "Sp = " <> int sm_sp $$
+ text "sm_args = " <> int sm_args $$
+ text "sm_ret_off = " <> int sm_ret_off $$
+ text "sm_regs = " <> ppr (eltsUFM sm_regs)
+
+
+cmmLayoutStack :: ProcPointSet -> ByteOff -> CmmGraph
+ -> UniqSM (CmmGraph, BlockEnv StackMap)
+cmmLayoutStack procpoints entry_args
+ graph0@(CmmGraph { g_entry = entry })
+ = do
+ pprTrace "cmmLayoutStack" (ppr entry_args) $ return ()
+ (graph, liveness) <- removeDeadAssignments graph0
+ pprTrace "liveness" (ppr liveness) $ return ()
+ let blocks = postorderDfs graph
+
+ (final_stackmaps, final_high_sp, new_blocks) <-
+ mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
+ layout procpoints liveness entry entry_args
+ rec_stackmaps rec_high_sp blocks
+
+ new_blocks' <- mapM lowerSafeForeignCall new_blocks
+
+ pprTrace ("Sp HWM") (ppr final_high_sp) $
+ return (ofBlockList entry new_blocks', final_stackmaps)
+
+
+
+layout :: BlockSet -- proc points
+ -> BlockEnv CmmLive -- liveness
+ -> BlockId -- entry
+ -> ByteOff -- stack args on entry
+
+ -> BlockEnv StackMap -- [final] stack maps
+ -> ByteOff -- [final] Sp high water mark
+
+ -> [CmmBlock] -- [in] blocks
+
+ -> UniqSM
+ ( BlockEnv StackMap -- [out] stack maps
+ , ByteOff -- [out] Sp high water mark
+ , [CmmBlock] -- [out] new blocks
+ )
+
+layout procpoints liveness entry entry_args final_stackmaps final_hwm blocks
+ = go blocks init_stackmap entry_args []
+ where
+ (updfr, cont_info) = collectContInfo blocks
+
+ init_stackmap = mapSingleton entry StackMap{ sm_sp = entry_args
+ , sm_args = entry_args
+ , sm_ret_off = updfr
+ , sm_regs = emptyUFM
+ }
+
+ go [] acc_stackmaps acc_hwm acc_blocks
+ = return (acc_stackmaps, acc_hwm, acc_blocks)
+
+ go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
+ = do
+ let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0
+
+ let stack0@StackMap { sm_sp = sp0 }
+ = mapFindWithDefault
+ (pprPanic "no stack map for" (ppr entry_lbl))
+ entry_lbl acc_stackmaps
+
+ pprTrace "layout" (ppr entry_lbl <+> ppr stack0) $ return ()
+
+ -- (a) Update the stack map to include the effects of
+ -- assignments in this block
+ let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0
+
+ -- (b) Insert assignments to reload all the live variables if this
+ -- block is a proc point
+ let middle1 = if entry_lbl `setMember` procpoints
+ then foldr blockCons middle0 (insertReloads stack0)
+ else middle0
+
+ -- (c) Look at the last node and if we are making a call or
+ -- jumping to a proc point, we must save the live
+ -- variables, adjust Sp, and construct the StackMaps for
+ -- each of the successor blocks. See handleLastNode for
+ -- details.
+ (middle2, sp_off, last1, fixup_blocks, out)
+ <- handleLastNode procpoints liveness cont_info
+ acc_stackmaps stack1 middle0 last0
+
+ pprTrace "layout(out)" (ppr out) $ return ()
+
+ -- (d) Manifest Sp: run over the nodes in the block and replace
+ -- CmmStackSlot with CmmLoad from Sp with a concrete offset.
+ --
+ -- our block:
+ -- middle1 -- the original middle nodes
+ -- middle2 -- live variable saves from handleLastNode
+ -- Sp = Sp + sp_off -- Sp adjustment goes here
+ -- last1 -- the last node
+ --
+ let middle_pre = blockToList $ foldl blockSnoc middle1 middle2
+
+ sp_high = final_hwm - entry_args
+ -- The stack check value is adjusted by the Sp offset on
+ -- entry to the proc, which is entry_args. We are
+ -- assuming that we only do a stack check at the
+ -- beginning of a proc, and we don't modify Sp before the
+ -- check.
+
+ final_blocks = manifestSp final_stackmaps stack0 sp0 sp_high entry0
+ middle_pre sp_off last1 fixup_blocks
+
+ acc_stackmaps' = mapUnion acc_stackmaps out
+
+ hwm' = maximum (acc_hwm : (sp0 - sp_off) : map sm_sp (mapElems out))
+
+ go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks)
+
+
+-- -----------------------------------------------------------------------------
+
+-- This doesn't seem right somehow. We need to find out whether this
+-- proc will push some update frame material at some point, so that we
+-- can avoid using that area of the stack for spilling. The
+-- updfr_space field of the CmmProc *should* tell us, but it doesn't
+-- (I think maybe it gets filled in later when we do proc-point
+-- splitting).
+--
+-- So we'll just take the max of all the cml_ret_offs. This could be
+-- unnecessarily pessimistic, but probably not in the code we
+-- generate.
+
+collectContInfo :: [CmmBlock] -> (ByteOff, BlockEnv ByteOff)
+collectContInfo blocks
+ = (maximum ret_offs, mapFromList (catMaybes mb_argss))
+ where
+ (mb_argss, ret_offs) = mapAndUnzip get_cont blocks
+
+ get_cont b =
+ case lastNode b of
+ CmmCall { cml_cont = Just l, .. }
+ -> (Just (l, cml_ret_args), cml_ret_off)
+ CmmForeignCall { .. }
+ -> (Just (succ, 0), updfr) -- ??
+ _other -> (Nothing, 0)
+
+
+-- -----------------------------------------------------------------------------
+-- Updating the StackMap from middle nodes
+
+-- Look for loads from stack slots, and update the StackMap. This is
+-- purely for optimisation reasons, so that we can avoid saving a
+-- variable back to a different stack slot if it is already on the
+-- stack.
+--
+-- This happens a lot: for example when function arguments are passed
+-- on the stack and need to be immediately saved across a call, we
+-- want to just leave them where they are on the stack.
+--
+procMiddle :: BlockEnv StackMap -> CmmNode e x -> StackMap -> StackMap
+procMiddle stackmaps node sm
+ = case node of
+ CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _)
+ -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) }
+ where loc = getStackLoc area off stackmaps
+ CmmAssign (CmmLocal r) _other
+ -> sm { sm_regs = delFromUFM (sm_regs sm) r }
+ _other
+ -> sm
+
+getStackLoc :: Area -> ByteOff -> BlockEnv StackMap -> StackLoc
+getStackLoc Old n _ = n
+getStackLoc (Young l) n stackmaps =
+ case mapLookup l stackmaps of
+ Nothing -> pprPanic "getStackLoc" (ppr l)
+ Just sm -> sm_sp sm - sm_args sm + n
+
+
+-- -----------------------------------------------------------------------------
+-- Handling stack allocation for a last node
+
+-- We take a single last node and turn it into:
+--
+-- C1 (some statements)
+-- Sp = Sp + N
+-- C2 (some more statements)
+-- call f() -- the actual last node
+--
+-- plus possibly some more blocks (we may have to add some fixup code
+-- between the last node and the continuation).
+--
+-- C1: is the code for saving the variables across this last node onto
+-- the stack, if the continuation is a call or jumps to a proc point.
+--
+-- C2: if the last node is a safe foreign call, we have to inject some
+-- extra code that goes *after* the Sp adjustment.
+
+handleLastNode
+ :: ProcPointSet -> BlockEnv CmmLive -> BlockEnv ByteOff
+ -> BlockEnv StackMap -> StackMap
+ -> Block CmmNode O O
+ -> CmmNode O C
+ -> UniqSM
+ ( [CmmNode O O] -- nodes to go *before* the Sp adjustment
+ , ByteOff -- amount to adjust Sp
+ , CmmNode O C -- new last node
+ , [CmmBlock] -- new blocks
+ , BlockEnv StackMap -- stackmaps for the continuations
+ )
+
+handleLastNode procpoints liveness cont_info stackmaps
+ stack0@StackMap { sm_sp = sp0 } middle last
+ = case last of
+ -- At each return / tail call,
+ -- adjust Sp to point to the last argument pushed, which
+ -- is cml_args, after popping any other junk from the stack.
+ CmmCall{ cml_cont = Nothing, .. } -> do
+ let sp_off = sp0 - cml_args
+ return ([], sp_off, last, [], mapEmpty)
+
+ -- At each CmmCall with a continuation:
+ CmmCall{ cml_cont = Just cont_lbl, .. } ->
+ return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off
+
+ CmmForeignCall{ succ = cont_lbl, .. } -> do
+ return $ lastCall cont_lbl wORD_SIZE wORD_SIZE (sm_ret_off stack0)
+ -- one word each for args and results: the return address
+
+ CmmBranch{..} -> handleProcPoints
+ CmmCondBranch{..} -> handleProcPoints
+ CmmSwitch{..} -> handleProcPoints
+
+ where
+ -- Calls and ForeignCalls are handled the same way:
+ lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff
+ -> ( [CmmNode O O]
+ , ByteOff
+ , CmmNode O C
+ , [CmmBlock]
+ , BlockEnv StackMap
+ )
+ lastCall lbl cml_args cml_ret_args cml_ret_off
+ = ( assignments
+ , spOffsetForCall sp0 cont_stack cml_args
+ , last
+ , [] -- no new blocks
+ , mapSingleton lbl cont_stack )
+ where
+ (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off
+
+
+ prepareStack lbl cml_ret_args cml_ret_off
+ | Just cont_stack <- mapLookup lbl stackmaps
+ -- If we have already seen this continuation before, then
+ -- we just have to make the stack look the same:
+ = (fixupStack stack0 cont_stack, cont_stack)
+ -- Otherwise, we have to allocate the stack frame
+ | otherwise
+ = (save_assignments, new_cont_stack)
+ where
+ (new_cont_stack, save_assignments)
+ = setupStackFrame lbl liveness cml_ret_off cml_ret_args stack0
+
+
+ -- For other last nodes (branches), if any of the targets is a
+ -- proc point, we have to set up the stack to match what the proc
+ -- point is expecting.
+ --
+ handleProcPoints :: UniqSM ( [CmmNode O O]
+ , ByteOff
+ , CmmNode O C
+ , [CmmBlock]
+ , BlockEnv StackMap )
+
+ handleProcPoints
+ -- Note [diamond proc point]
+ | Just l <- futureContinuation middle
+ , (nub $ filter (`setMember` procpoints) $ successors last) == [l]
+ = do
+ let cont_args = mapFindWithDefault 0 l cont_info
+ (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0)
+ out = mapFromList [ (l', cont_stack)
+ | l' <- successors last ]
+ return ( assigs
+ , spOffsetForCall sp0 cont_stack wORD_SIZE
+ , last
+ , []
+ , out)
+
+ | otherwise = do
+ pps <- mapM handleProcPoint (successors last)
+ let lbl_map :: LabelMap Label
+ lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ]
+ fix_lbl l = mapLookup l lbl_map `orElse` l
+ return ( []
+ , 0
+ , mapSuccessors fix_lbl last
+ , concat [ blk | (_,_,_,blk) <- pps ]
+ , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] )
+
+ -- For each proc point that is a successor of this block
+ -- (a) if the proc point already has a stackmap, we need to
+ -- shuffle the current stack to make it look the same.
+ -- We have to insert a new block to make this happen.
+ -- (b) otherwise, call "allocate live stack0" to make the
+ -- stack map for the proc point
+ handleProcPoint :: BlockId
+ -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock])
+ handleProcPoint l
+ | not (l `setMember` procpoints) = return (l, l, stack0, [])
+ | otherwise = do
+ tmp_lbl <- liftM mkBlockId $ getUniqueM
+ let
+ (stack2, assigs) =
+ case mapLookup l stackmaps of
+ Just pp_sm -> (pp_sm, fixupStack stack0 pp_sm)
+ Nothing ->
+ pprTrace "first visit to proc point"
+ (ppr l <+> ppr stack1) $
+ (stack1, assigs)
+ where
+ cont_args = mapFindWithDefault 0 l cont_info
+ (stack1, assigs) =
+ setupStackFrame l liveness (sm_ret_off stack0)
+ cont_args stack0
+
+ sp_off = sp0 - sm_sp stack2
+
+ block = blockJoin (CmmEntry tmp_lbl)
+ (maybeAddSpAdj sp_off (blockFromList assigs))
+ (CmmBranch l)
+ --
+ return (l, tmp_lbl, stack2, [block])
+
+
+
+-- Sp is currently pointing to current_sp,
+-- we want it to point to
+-- (sm_sp cont_stack - sm_args cont_stack + args)
+-- so the difference is
+-- sp0 - (sm_sp cont_stack - sm_args cont_stack + args)
+spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff
+spOffsetForCall current_sp cont_stack args
+ = current_sp - (sm_sp cont_stack - sm_args cont_stack + args)
+
+
+-- | create a sequence of assignments to establish the new StackMap,
+-- given the old StackMap.
+fixupStack :: StackMap -> StackMap -> [CmmNode O O]
+fixupStack old_stack new_stack = concatMap move new_locs
+ where
+ old_map :: Map LocalReg ByteOff
+ old_map = Map.fromList (stackSlotRegs old_stack)
+ new_locs = stackSlotRegs new_stack
+
+ move (r,n)
+ | Just m <- Map.lookup r old_map, n == m = []
+ | otherwise = [CmmStore (CmmStackSlot Old n)
+ (CmmReg (CmmLocal r))]
+
+
+
+setupStackFrame
+ :: BlockId -- label of continuation
+ -> BlockEnv CmmLive -- liveness
+ -> ByteOff -- updfr
+ -> ByteOff -- bytes of return values on stack
+ -> StackMap -- current StackMap
+ -> (StackMap, [CmmNode O O])
+
+setupStackFrame lbl liveness updfr_off ret_args stack0
+ = (cont_stack, assignments)
+ where
+ -- get the set of LocalRegs live in the continuation
+ live = mapFindWithDefault Set.empty lbl liveness
+
+ -- the stack from the base to updfr_off is off-limits.
+ -- our new stack frame contains:
+ -- * saved live variables
+ -- * the return address [young(C) + 8]
+ -- * the args for the call,
+ -- which are replaced by the return values at the return
+ -- point.
+
+ -- everything up to updfr_off is off-limits
+ -- stack1 contains updfr_off, plus everything we need to save
+ (stack1, assignments) = allocate updfr_off live stack0
+
+ -- And the Sp at the continuation is:
+ -- sm_sp stack1 + ret_args
+ cont_stack = stack1{ sm_sp = sm_sp stack1 + ret_args
+ , sm_args = ret_args
+ , sm_ret_off = updfr_off
+ }
+
+
+-- -----------------------------------------------------------------------------
+-- Note [diamond proc point]
+--
+-- This special case looks for the pattern we get from a typical
+-- tagged case expression:
+--
+-- Sp[young(L1)] = L1
+-- if (R1 & 7) != 0 goto L1 else goto L2
+-- L2:
+-- call [R1] returns to L1
+-- L1: live: {y}
+-- x = R1
+--
+-- If we let the generic case handle this, we get
+--
+-- Sp[-16] = L1
+-- if (R1 & 7) != 0 goto L1a else goto L2
+-- L2:
+-- Sp[-8] = y
+-- Sp = Sp - 16
+-- call [R1] returns to L1
+-- L1a:
+-- Sp[-8] = y
+-- Sp = Sp - 16
+-- goto L1
+-- L1:
+-- x = R1
+--
+-- The code for saving the live vars is duplicated in each branch, and
+-- furthermore there is an extra jump in the fast path (assuming L1 is
+-- a proc point, which it probably is if there is a heap check).
+--
+-- So to fix this we want to set up the stack frame before the
+-- conditional jump. How do we know when to do this, and when it is
+-- safe? The basic idea is, when we see the assignment
+--
+-- Sp[young(L)] = L
+--
+-- we know that
+-- * we are definitely heading for L
+-- * there can be no more reads from another stack area, because young(L)
+-- overlaps with it.
+--
+-- We don't necessarily know that everything live at L is live now
+-- (some might be assigned between here and the jump to L). So we
+-- simplify and only do the optimisation when we see
+--
+-- (1) a block containing an assignment of a return address L
+-- (2) ending in a branch where one (and only) continuation goes to L,
+-- and no other continuations go to proc points.
+--
+-- then we allocate the stack frame for L at the end of the block,
+-- before the branch.
+--
+-- We could generalise (2), but that would make it a bit more
+-- complicated to handle, and this currently catches the common case.
+
+futureContinuation :: Block CmmNode O O -> Maybe BlockId
+futureContinuation middle = foldBlockNodesB f middle Nothing
+ where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId
+ f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _
+ = Just l
+ f _ r = r
+
+-- -----------------------------------------------------------------------------
+-- Saving live registers
+
+-- | Given a set of live registers and a StackMap, save all the registers
+-- on the stack and return the new StackMap and the assignments to do
+-- the saving.
+--
+allocate :: ByteOff -> RegSet -> StackMap -> (StackMap, [CmmNode O O])
+allocate ret_off live stackmap@StackMap{ sm_sp = sp0
+ , sm_regs = regs0 }
+ =
+ pprTrace "allocate" (ppr live $$ ppr stackmap) $
+
+ -- we only have to save regs that are not already in a slot
+ let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live)
+ regs1 = filterUFM (\(r,_) -> elemRegSet r live) regs0
+ in
+
+ -- make a map of the stack
+ let stack = reverse $ Array.elems $
+ accumArray (\_ x -> x) Empty (1, toWords (max sp0 ret_off)) $
+ ret_words ++ live_words
+ where ret_words =
+ [ (x, Occupied)
+ | x <- [ 1 .. toWords ret_off] ]
+ live_words =
+ [ (toWords x, Occupied)
+ | (r,off) <- eltsUFM regs1,
+ let w = localRegBytes r,
+ x <- [ off, off-wORD_SIZE .. off - w + 1] ]
+ in
+
+ -- Pass over the stack: find slots to save all the new live variables,
+ -- choosing the oldest slots first (hence a foldr).
+ let
+ save slot ([], stack, n, assigs, regs) -- no more regs to save
+ = ([], slot:stack, n `plusW` 1, assigs, regs)
+ save slot (to_save, stack, n, assigs, regs)
+ = case slot of
+ Occupied -> (to_save, Occupied:stack, n `plusW` 1, assigs, regs)
+ Empty
+ | Just (stack', r, to_save') <-
+ select_save to_save (slot:stack)
+ -> let assig = CmmStore (CmmStackSlot Old n')
+ (CmmReg (CmmLocal r))
+ n' = n `plusW` 1
+ in
+ (to_save', stack', n', assig : assigs, (r,(r,n')):regs)
+
+ | otherwise
+ -> (to_save, slot:stack, n `plusW` 1, assigs, regs)
+
+ -- we should do better here: right now we'll fit the smallest first,
+ -- but it would make more sense to fit the biggest first.
+ select_save :: [LocalReg] -> [StackSlot]
+ -> Maybe ([StackSlot], LocalReg, [LocalReg])
+ select_save regs stack = go regs []
+ where go [] _no_fit = Nothing
+ go (r:rs) no_fit
+ | Just rest <- dropEmpty words stack
+ = Just (replicate words Occupied ++ rest, r, rs++no_fit)
+ | otherwise
+ = go rs (r:no_fit)
+ where words = localRegWords r
+
+ -- fill in empty slots as much as possible
+ (still_to_save, save_stack, n, save_assigs, save_regs)
+ = foldr save (to_save, [], 0, [], []) stack
+
+ -- push any remaining live vars on the stack
+ (push_sp, push_assigs, push_regs)
+ = foldr push (n, [], []) still_to_save
+ where
+ push r (n, assigs, regs)
+ = (n', assig : assigs, (r,(r,n')) : regs)
+ where
+ n' = n + localRegBytes r
+ assig = CmmStore (CmmStackSlot Old n')
+ (CmmReg (CmmLocal r))
+
+ trim_sp
+ | not (null push_regs) = push_sp
+ | otherwise
+ = n `plusW` (- length (takeWhile isEmpty save_stack))
+
+ final_regs = regs1 `addListToUFM` push_regs
+ `addListToUFM` save_regs
+
+ in
+ -- XXX should be an assert
+ if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else
+
+ if (trim_sp .&. (wORD_SIZE - 1)) /= 0 then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else
+
+ ( stackmap { sm_regs = final_regs , sm_sp = trim_sp }
+ , push_assigs ++ save_assigs )
+
+
+-- -----------------------------------------------------------------------------
+-- Manifesting Sp
+
+-- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp. The
+-- block looks like this:
+--
+-- middle_pre -- the middle nodes
+-- Sp = Sp + sp_off -- Sp adjustment goes here
+-- last -- the last node
+--
+-- And we have some extra blocks too (that don't contain Sp adjustments)
+--
+-- The adjustment for middle_pre will be different from that for
+-- middle_post, because the Sp adjustment intervenes.
+--
+manifestSp
+ :: BlockEnv StackMap -- StackMaps for other blocks
+ -> StackMap -- StackMap for this block
+ -> ByteOff -- Sp on entry to the block
+ -> ByteOff -- SpHigh
+ -> CmmNode C O -- first node
+ -> [CmmNode O O] -- middle
+ -> ByteOff -- sp_off
+ -> CmmNode O C -- last node
+ -> [CmmBlock] -- new blocks
+ -> [CmmBlock] -- final blocks with Sp manifest
+
+manifestSp stackmaps stack0 sp0 sp_high
+ first middle_pre sp_off last fixup_blocks
+ = final_block : fixup_blocks'
+ where
+ area_off = getAreaOff stackmaps
+
+ adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x
+ adj_pre_sp = mapExpDeep (areaToSp sp0 sp_high area_off)
+ adj_post_sp = mapExpDeep (areaToSp (sp0 - sp_off) sp_high area_off)
+
+ final_middle = maybeAddSpAdj sp_off $
+ blockFromList $
+ map adj_pre_sp $
+ elimStackStores stack0 stackmaps area_off $
+ middle_pre
+
+ final_last = optStackCheck (adj_post_sp last)
+
+ final_block = blockJoin first final_middle final_last
+
+ fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks
+
+
+getAreaOff :: BlockEnv StackMap -> (Area -> StackLoc)
+getAreaOff _ Old = 0
+getAreaOff stackmaps (Young l) =
+ case mapLookup l stackmaps of
+ Just sm -> sm_sp sm - sm_args sm
+ Nothing -> pprPanic "getAreaOff" (ppr l)
+
+
+maybeAddSpAdj :: ByteOff -> Block CmmNode O O -> Block CmmNode O O
+maybeAddSpAdj 0 block = block
+maybeAddSpAdj sp_off block
+ = block `blockSnoc` CmmAssign spReg (cmmOffset (CmmReg spReg) sp_off)
+
+
+{-
+Sp(L) is the Sp offset on entry to block L relative to the base of the
+OLD area.
+
+SpArgs(L) is the size of the young area for L, i.e. the number of
+arguments.
+
+ - in block L, each reference to [old + N] turns into
+ [Sp + Sp(L) - N]
+
+ - in block L, each reference to [young(L') + N] turns into
+ [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]
+
+ - be careful with the last node of each block: Sp has already been adjusted
+ to be Sp + Sp(L) - Sp(L')
+-}
+
+areaToSp :: ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr
+areaToSp sp_old _sp_hwm area_off (CmmStackSlot area n) =
+ cmmOffset (CmmReg spReg) (sp_old - area_off area - n)
+areaToSp _ sp_hwm _ (CmmLit CmmHighStackMark) = CmmLit (mkIntCLit sp_hwm)
+areaToSp _ _ _ (CmmMachOp (MO_U_Lt _) -- Note [null stack check]
+ [CmmMachOp (MO_Sub _)
+ [ CmmReg (CmmGlobal Sp)
+ , CmmLit (CmmInt 0 _)],
+ CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
+areaToSp _ _ _ other = other
+
+-- -----------------------------------------------------------------------------
+-- Note [null stack check]
+--
+-- If the high-water Sp is zero, then we end up with
+--
+-- if (Sp - 0 < SpLim) then .. else ..
+--
+-- and possibly some dead code for the failure case. Optimising this
+-- away depends on knowing that SpLim <= Sp, so it is really the job
+-- of the stack layout algorithm, hence we do it now. This is also
+-- convenient because control-flow optimisation later will drop the
+-- dead code.
+
+optStackCheck :: CmmNode O C -> CmmNode O C
+optStackCheck n = -- Note [null stack check]
+ case n of
+ CmmCondBranch (CmmLit (CmmInt 0 _)) _true false -> CmmBranch false
+ other -> other
+
+
+-- -----------------------------------------------------------------------------
+
+-- | Eliminate stores of the form
+--
+-- Sp[area+n] = r
+--
+-- when we know that r is already in the same slot as Sp[area+n]. We
+-- could do this in a later optimisation pass, but that would involve
+-- a separate analysis and we already have the information to hand
+-- here. It helps clean up some extra stack stores in common cases.
+--
+-- Note that we may have to modify the StackMap as we walk through the
+-- code using procMiddle, since an assignment to a variable in the
+-- StackMap will invalidate its mapping there.
+--
+elimStackStores :: StackMap
+ -> BlockEnv StackMap
+ -> (Area -> ByteOff)
+ -> [CmmNode O O]
+ -> [CmmNode O O]
+elimStackStores stackmap stackmaps area_off nodes
+ = go stackmap nodes
+ where
+ go _stackmap [] = []
+ go stackmap (n:ns)
+ = case n of
+ CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r))
+ | Just (_,off) <- lookupUFM (sm_regs stackmap) r
+ , area_off area + m == off
+ -> pprTrace "eliminated a node!" (ppr r) $ go stackmap ns
+ _otherwise
+ -> n : go (procMiddle stackmaps n stackmap) ns
+
+
+-- -----------------------------------------------------------------------------
+-- Update info tables to include stack liveness
+
+
+setInfoTableStackMap :: BlockEnv StackMap -> CmmDecl -> CmmDecl
+setInfoTableStackMap stackmaps
+ (CmmProc top_info@TopInfo{..} l g@CmmGraph{g_entry = eid})
+ = CmmProc top_info{ info_tbl = fix_info info_tbl } l g
+ where
+ fix_info info_tbl@CmmInfoTable{ cit_rep = StackRep _ } =
+ info_tbl { cit_rep = StackRep (get_liveness eid) }
+ fix_info other = other
+
+ get_liveness :: BlockId -> Liveness
+ get_liveness lbl
+ = case mapLookup lbl stackmaps of
+ Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl)
+ Just sm -> stackMapToLiveness sm
+
+setInfoTableStackMap _ d = d
+
+
+stackMapToLiveness :: StackMap -> Liveness
+stackMapToLiveness StackMap{..} =
+ reverse $ Array.elems $
+ accumArray (\_ x -> x) True (toWords sm_ret_off + 1,
+ toWords (sm_sp - sm_args)) live_words
+ where
+ live_words = [ (toWords off, False)
+ | (r,off) <- eltsUFM sm_regs, isGcPtrType (localRegType r) ]
+
+
+-- -----------------------------------------------------------------------------
+-- Lowering safe foreign calls
+
+{-
+Note [lower safe foreign calls]
+
+We start with
+
+ Sp[young(L1)] = L1
+ ,-----------------------
+ | r1 = foo(x,y,z) returns to L1
+ '-----------------------
+ L1:
+ R1 = r1 -- copyIn, inserted by mkSafeCall
+ ...
+
+the stack layout algorithm will arrange to save and reload everything
+live across the call. Our job now is to expand the call so we get
+
+ Sp[young(L1)] = L1
+ ,-----------------------
+ | SAVE_THREAD_STATE()
+ | token = suspendThread(BaseReg, interruptible)
+ | r = foo(x,y,z)
+ | BaseReg = resumeThread(token)
+ | LOAD_THREAD_STATE()
+ | R1 = r -- copyOut
+ | jump L1
+ '-----------------------
+ L1:
+ r = R1 -- copyIn, inserted by mkSafeCall
+ ...
+
+Note the copyOut, which saves the results in the places that L1 is
+expecting them (see Note [safe foreign call convention]).
+-}
+
+lowerSafeForeignCall :: CmmBlock -> UniqSM CmmBlock
+lowerSafeForeignCall block
+ | (entry, middle, CmmForeignCall { .. }) <- blockSplit block
+ = do
+ -- Both 'id' and 'new_base' are KindNonPtr because they're
+ -- RTS-only objects and are not subject to garbage collection
+ id <- newTemp bWord
+ new_base <- newTemp (cmmRegType (CmmGlobal BaseReg))
+ let (caller_save, caller_load) = callerSaveVolatileRegs
+ load_tso <- newTemp gcWord
+ load_stack <- newTemp gcWord
+ let suspend = saveThreadState <*>
+ caller_save <*>
+ mkMiddle (callSuspendThread id intrbl)
+ midCall = mkUnsafeCall tgt res args
+ resume = mkMiddle (callResumeThread new_base id) <*>
+ -- Assign the result to BaseReg: we
+ -- might now have a different Capability!
+ mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <*>
+ caller_load <*>
+ loadThreadState load_tso load_stack
+ -- Note: The successor must be a procpoint, and we have already split,
+ -- so we use a jump, not a branch.
+ succLbl = CmmLit (CmmLabel (infoTblLbl succ))
+
+ (ret_args, regs, copyout) = copyOutOflow NativeReturn Jump (Young succ)
+ (map (CmmReg . CmmLocal) res)
+ updfr (0, [])
+
+ jump = CmmCall { cml_target = succLbl
+ , cml_cont = Just succ
+ , cml_args_regs = regs
+ , cml_args = widthInBytes wordWidth
+ , cml_ret_args = ret_args
+ , cml_ret_off = updfr }
+
+ graph' <- lgraphOfAGraph $ suspend <*>
+ midCall <*>
+ resume <*>
+ copyout <*>
+ mkLast jump
+
+ case toBlockList graph' of
+ [one] -> let (_, middle', last) = blockSplit one
+ in return (blockJoin entry (middle `blockAppend` middle') last)
+ _ -> panic "lowerSafeForeignCall0"
+
+ -- Block doesn't end in a safe foreign call:
+ | otherwise = return block
+
+
+foreignLbl :: FastString -> CmmExpr
+foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId name))
+
+newTemp :: CmmType -> UniqSM LocalReg
+newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep)
+
+callSuspendThread :: LocalReg -> Bool -> CmmNode O O
+callSuspendThread id intrbl =
+ CmmUnsafeForeignCall
+ (ForeignTarget (foreignLbl (fsLit "suspendThread"))
+ (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
+ [id] [CmmReg (CmmGlobal BaseReg), CmmLit (mkIntCLit (fromEnum intrbl))]
+
+callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
+callResumeThread new_base id =
+ CmmUnsafeForeignCall
+ (ForeignTarget (foreignLbl (fsLit "resumeThread"))
+ (ForeignConvention CCallConv [AddrHint] [AddrHint]))
+ [new_base] [CmmReg (CmmLocal id)]
+
+-- -----------------------------------------------------------------------------
+
+plusW :: ByteOff -> WordOff -> ByteOff
+plusW b w = b + w * wORD_SIZE
+
+dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot]
+dropEmpty 0 ss = Just ss
+dropEmpty n (Empty : ss) = dropEmpty (n-1) ss
+dropEmpty _ _ = Nothing
+
+isEmpty :: StackSlot -> Bool
+isEmpty Empty = True
+isEmpty _ = False
+
+localRegBytes :: LocalReg -> ByteOff
+localRegBytes r = roundUpToWords (widthInBytes (typeWidth (localRegType r)))
+
+localRegWords :: LocalReg -> WordOff
+localRegWords = toWords . localRegBytes
+
+toWords :: ByteOff -> WordOff
+toWords x = x `quot` wORD_SIZE
+
+
+insertReloads :: StackMap -> [CmmNode O O]
+insertReloads stackmap =
+ [ CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot Old sp)
+ (localRegType r))
+ | (r,sp) <- stackSlotRegs stackmap
+ ]
+
+
+stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)]
+stackSlotRegs sm = eltsUFM (sm_regs sm)
+
+-- -----------------------------------------------------------------------------
+
+-- If we do this *before* stack layout, we might be able to avoid
+-- saving some things across calls/procpoints.
+--
+-- *but*, that will invalidate the liveness analysis, and we'll have
+-- to re-do it.
+
+cmmSink :: CmmGraph -> UniqSM CmmGraph
+cmmSink graph = do
+ let liveness = cmmLiveness graph
+ return $ cmmSink' liveness graph
+
+cmmSink' :: BlockEnv CmmLive -> CmmGraph -> CmmGraph
+cmmSink' liveness graph
+ = ofBlockList (g_entry graph) $ sink mapEmpty $ postorderDfs graph
+ where
+
+ sink :: BlockEnv [(LocalReg, CmmExpr)] -> [CmmBlock] -> [CmmBlock]
+ sink _ [] = []
+ sink sunk (b:bs) =
+ pprTrace "sink" (ppr l) $
+ blockJoin first final_middle last : sink sunk' bs
+ where
+ l = entryLabel b
+ (first, middle, last) = blockSplit b
+ (middle', assigs) = walk (blockToList middle) emptyBlock
+ (mapFindWithDefault [] l sunk)
+
+ (dropped_last, assigs') = partition (`conflictsWithLast` last) assigs
+
+ final_middle = foldl blockSnoc middle' (toNodes dropped_last)
+
+ sunk' = mapUnion sunk $
+ mapFromList [ (l, filt assigs' (getLive l))
+ | l <- successors last ]
+ where
+ getLive l = mapFindWithDefault Set.empty l liveness
+ filt as live = [ (r,e) | (r,e) <- as, r `Set.member` live ]
+
+
+walk :: [CmmNode O O] -> Block CmmNode O O -> [(LocalReg, CmmExpr)]
+ -> (Block CmmNode O O, [(LocalReg, CmmExpr)])
+
+walk [] acc as = (acc, as)
+walk (n:ns) acc as
+ | Just a <- collect_it = walk ns acc (a:as)
+ | otherwise = walk ns (foldr (flip blockSnoc) acc (n:drop_nodes)) as'
+ where
+ collect_it = case n of
+ CmmAssign (CmmLocal r) e@(CmmReg (CmmGlobal _)) -> Just (r,e)
+-- CmmAssign (CmmLocal r) e@(CmmLoad addr _) |
+-- foldRegsUsed (\b r -> False) True addr -> Just (r,e)
+ _ -> Nothing
+
+ drop_nodes = toNodes dropped
+ (dropped, as') = partition should_drop as
+ where should_drop a = a `conflicts` n
+
+toNodes :: [(LocalReg,CmmExpr)] -> [CmmNode O O]
+toNodes as = [ CmmAssign (CmmLocal r) rhs | (r,rhs) <- as ]
+
+-- We only sink "r = G" assignments right now, so conflicts is very simple:
+conflicts :: (LocalReg,CmmExpr) -> CmmNode O O -> Bool
+(_, rhs) `conflicts` CmmAssign reg _ | reg `regUsedIn` rhs = True
+--(r, CmmLoad _ _) `conflicts` CmmStore _ _ = True
+(r, _) `conflicts` node
+ = foldRegsUsed (\b r' -> r == r' || b) False node
+
+conflictsWithLast :: (LocalReg,CmmExpr) -> CmmNode O C -> Bool
+(r, _) `conflictsWithLast` node
+ = foldRegsUsed (\b r' -> r == r' || b) False node
diff --git a/compiler/cmm/CmmLint.hs b/compiler/cmm/CmmLint.hs
index 01ebac6254..2e24dd7f82 100644
--- a/compiler/cmm/CmmLint.hs
+++ b/compiler/cmm/CmmLint.hs
@@ -1,67 +1,70 @@
-----------------------------------------------------------------------------
--
--- (c) The University of Glasgow 2004-2006
+-- (c) The University of Glasgow 2011
--
-- CmmLint: checking the correctness of Cmm statements and expressions
--
-----------------------------------------------------------------------------
-
-{-# OPTIONS -fno-warn-tabs #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and
--- detab the module (please do the detabbing in a separate patch). See
--- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
--- for details
-
+{-# LANGUAGE GADTs #-}
module CmmLint (
- cmmLint, cmmLintTop
+ cmmLint, cmmLintGraph
) where
+import Hoopl
+import Cmm
+import CmmUtils
+import PprCmm ()
import BlockId
-import OldCmm
-import CLabel
+import FastString
import Outputable
-import OldPprCmm()
import Constants
-import FastString
-import Platform
import Data.Maybe
+-- Things to check:
+-- - invariant on CmmBlock in CmmExpr (see comment there)
+-- - check for branches to blocks that don't exist
+-- - check types
+
-- -----------------------------------------------------------------------------
-- Exported entry points:
cmmLint :: (Outputable d, Outputable h)
- => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc
-cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops
+ => GenCmmGroup d h CmmGraph -> Maybe SDoc
+cmmLint tops = runCmmLint (mapM_ lintCmmDecl) tops
-cmmLintTop :: (Outputable d, Outputable h)
- => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc
-cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top
+cmmLintGraph :: CmmGraph -> Maybe SDoc
+cmmLintGraph g = runCmmLint lintCmmGraph g
-runCmmLint :: Outputable a
- => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc
-runCmmLint _ l p =
+runCmmLint :: Outputable a => (a -> CmmLint b) -> a -> Maybe SDoc
+runCmmLint l p =
case unCL (l p) of
- Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
- nest 2 err,
- ptext $ sLit ("Program was:"),
- nest 2 (ppr p)])
- Right _ -> Nothing
-
-lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint ()
-lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks))
- = addLintInfo (text "in proc " <> pprCLabel platform lbl) $
- let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks
- in mapM_ (lintCmmBlock platform labels) blocks
-
-lintCmmDecl _ (CmmData {})
+ Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
+ nest 2 err,
+ ptext $ sLit ("Program was:"),
+ nest 2 (ppr p)])
+ Right _ -> Nothing
+
+lintCmmDecl :: GenCmmDecl h i CmmGraph -> CmmLint ()
+lintCmmDecl (CmmProc _ lbl g)
+ = addLintInfo (text "in proc " <> ppr lbl) $ lintCmmGraph g
+lintCmmDecl (CmmData {})
= return ()
-lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint ()
-lintCmmBlock platform labels (BasicBlock id stmts)
- = addLintInfo (text "in basic block " <> ppr id) $
- mapM_ (lintCmmStmt platform labels) stmts
+
+lintCmmGraph :: CmmGraph -> CmmLint ()
+lintCmmGraph g = mapM_ (lintCmmBlock labels) blocks
+ where
+ blocks = toBlockList g
+ labels = setFromList (map entryLabel blocks)
+
+
+lintCmmBlock :: BlockSet -> CmmBlock -> CmmLint ()
+lintCmmBlock labels block
+ = addLintInfo (text "in basic block " <> ppr (entryLabel block)) $ do
+ let (_, middle, last) = blockSplit block
+ mapM_ lintCmmMiddle (blockToList middle)
+ lintCmmLast labels last
-- -----------------------------------------------------------------------------
-- lintCmmExpr
@@ -69,24 +72,24 @@ lintCmmBlock platform labels (BasicBlock id stmts)
-- Checks whether a CmmExpr is "type-correct", and check for obvious-looking
-- byte/word mismatches.
-lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType
-lintCmmExpr platform (CmmLoad expr rep) = do
- _ <- lintCmmExpr platform expr
+lintCmmExpr :: CmmExpr -> CmmLint CmmType
+lintCmmExpr (CmmLoad expr rep) = do
+ _ <- lintCmmExpr expr
-- Disabled, if we have the inlining phase before the lint phase,
-- we can have funny offsets due to pointer tagging. -- EZY
-- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
-- cmmCheckWordAddress expr
return rep
-lintCmmExpr platform expr@(CmmMachOp op args) = do
- tys <- mapM (lintCmmExpr platform) args
+lintCmmExpr expr@(CmmMachOp op args) = do
+ tys <- mapM lintCmmExpr args
if map (typeWidth . cmmExprType) args == machOpArgReps op
- then cmmCheckMachOp op args tys
- else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
-lintCmmExpr platform (CmmRegOff reg offset)
- = lintCmmExpr platform (CmmMachOp (MO_Add rep)
- [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
+ then cmmCheckMachOp op args tys
+ else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
+lintCmmExpr (CmmRegOff reg offset)
+ = lintCmmExpr (CmmMachOp (MO_Add rep)
+ [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
where rep = typeWidth (cmmRegType reg)
-lintCmmExpr _ expr =
+lintCmmExpr expr =
return (cmmExprType expr)
-- Check for some common byte/word mismatches (eg. Sp + 1)
@@ -119,43 +122,61 @@ notNodeReg :: CmmExpr -> Bool
notNodeReg (CmmReg reg) | reg == nodeReg = False
notNodeReg _ = True
-lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint ()
-lintCmmStmt platform labels = lint
- where lint (CmmNop) = return ()
- lint (CmmComment {}) = return ()
- lint stmt@(CmmAssign reg expr) = do
- erep <- lintCmmExpr platform expr
- let reg_ty = cmmRegType reg
+lintCmmMiddle :: CmmNode O O -> CmmLint ()
+lintCmmMiddle node = case node of
+ CmmComment _ -> return ()
+
+ CmmAssign reg expr -> do
+ erep <- lintCmmExpr expr
+ let reg_ty = cmmRegType reg
if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
then return ()
- else cmmLintAssignErr stmt erep reg_ty
- lint (CmmStore l r) = do
- _ <- lintCmmExpr platform l
- _ <- lintCmmExpr platform r
+ else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
+
+ CmmStore l r -> do
+ _ <- lintCmmExpr l
+ _ <- lintCmmExpr r
return ()
- lint (CmmCall target _res args _) =
- do lintTarget platform labels target
- mapM_ (lintCmmExpr platform . hintlessCmm) args
- lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e
- lint (CmmSwitch e branches) = do
+
+ CmmUnsafeForeignCall target _formals actuals -> do
+ lintTarget target
+ mapM_ lintCmmExpr actuals
+
+
+lintCmmLast :: BlockSet -> CmmNode O C -> CmmLint ()
+lintCmmLast labels node = case node of
+ CmmBranch id -> checkTarget id
+
+ CmmCondBranch e t f -> do
+ mapM_ checkTarget [t,f]
+ _ <- lintCmmExpr e
+ checkCond e
+
+ CmmSwitch e branches -> do
mapM_ checkTarget $ catMaybes branches
- erep <- lintCmmExpr platform e
+ erep <- lintCmmExpr e
if (erep `cmmEqType_ignoring_ptrhood` bWord)
then return ()
- else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <>
- text " :: " <> ppr erep)
- lint (CmmJump e _) = lintCmmExpr platform e >> return ()
- lint (CmmReturn) = return ()
- lint (CmmBranch id) = checkTarget id
- checkTarget id = if setMember id labels then return ()
- else cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
-
-lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint ()
-lintTarget platform _ (CmmCallee e _) = do _ <- lintCmmExpr platform e
- return ()
-lintTarget _ _ (CmmPrim _ Nothing) = return ()
-lintTarget platform labels (CmmPrim _ (Just stmts))
- = mapM_ (lintCmmStmt platform labels) stmts
+ else cmmLintErr (text "switch scrutinee is not a word: " <>
+ ppr e <> text " :: " <> ppr erep)
+
+ CmmCall { cml_target = target, cml_cont = cont } -> do
+ _ <- lintCmmExpr target
+ maybe (return ()) checkTarget cont
+
+ CmmForeignCall tgt _ args succ _ _ -> do
+ lintTarget tgt
+ mapM_ lintCmmExpr args
+ checkTarget succ
+ where
+ checkTarget id
+ | setMember id labels = return ()
+ | otherwise = cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
+
+
+lintTarget :: ForeignTarget -> CmmLint ()
+lintTarget (ForeignTarget e _) = lintCmmExpr e >> return ()
+lintTarget (PrimTarget {}) = return ()
checkCond :: CmmExpr -> CmmLint ()
@@ -163,7 +184,7 @@ checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return ()
checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values
checkCond expr
= cmmLintErr (hang (text "expression is not a conditional:") 2
- (ppr expr))
+ (ppr expr))
-- -----------------------------------------------------------------------------
-- CmmLint monad
@@ -173,37 +194,36 @@ checkCond expr
newtype CmmLint a = CmmLint { unCL :: Either SDoc a }
instance Monad CmmLint where
- CmmLint m >>= k = CmmLint $ case m of
- Left e -> Left e
- Right a -> unCL (k a)
+ CmmLint m >>= k = CmmLint $ case m of
+ Left e -> Left e
+ Right a -> unCL (k a)
return a = CmmLint (Right a)
cmmLintErr :: SDoc -> CmmLint a
cmmLintErr msg = CmmLint (Left msg)
addLintInfo :: SDoc -> CmmLint a -> CmmLint a
-addLintInfo info thing = CmmLint $
+addLintInfo info thing = CmmLint $
case unCL thing of
- Left err -> Left (hang info 2 err)
- Right a -> Right a
+ Left err -> Left (hang info 2 err)
+ Right a -> Right a
cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a
cmmLintMachOpErr expr argsRep opExpectsRep
- = cmmLintErr (text "in MachOp application: " $$
- nest 2 (ppr expr) $$
- (text "op is expecting: " <+> ppr opExpectsRep) $$
- (text "arguments provide: " <+> ppr argsRep))
+ = cmmLintErr (text "in MachOp application: " $$
+ nest 2 (ppr expr) $$
+ (text "op is expecting: " <+> ppr opExpectsRep) $$
+ (text "arguments provide: " <+> ppr argsRep))
-cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a
+cmmLintAssignErr :: CmmNode e x -> CmmType -> CmmType -> CmmLint a
cmmLintAssignErr stmt e_ty r_ty
- = cmmLintErr (text "in assignment: " $$
- nest 2 (vcat [ppr stmt,
- text "Reg ty:" <+> ppr r_ty,
- text "Rhs ty:" <+> ppr e_ty]))
-
-
+ = cmmLintErr (text "in assignment: " $$
+ nest 2 (vcat [ppr stmt,
+ text "Reg ty:" <+> ppr r_ty,
+ text "Rhs ty:" <+> ppr e_ty]))
+
cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a
cmmLintDubiousWordOffset expr
= cmmLintErr (text "offset is not a multiple of words: " $$
- nest 2 (ppr expr))
+ nest 2 (ppr expr))
diff --git a/compiler/cmm/CmmLive.hs b/compiler/cmm/CmmLive.hs
index 9a5bb2d5ae..f0163fefc4 100644
--- a/compiler/cmm/CmmLive.hs
+++ b/compiler/cmm/CmmLive.hs
@@ -11,17 +11,15 @@ module CmmLive
)
where
+import UniqSupply
import BlockId
import Cmm
import CmmUtils
-import Control.Monad
-import OptimizationFuel
import PprCmmExpr ()
-import Compiler.Hoopl
+import Hoopl
import Maybes
import Outputable
-import UniqSet
-----------------------------------------------------------------------------
-- Calculating what variables are live on entry to a basic block
@@ -33,8 +31,10 @@ type CmmLive = RegSet
-- | The dataflow lattice
liveLattice :: DataflowLattice CmmLive
liveLattice = DataflowLattice "live LocalReg's" emptyRegSet add
- where add _ (OldFact old) (NewFact new) = case unionUniqSets old new of
- join -> (changeIf $ sizeUniqSet join > sizeUniqSet old, join)
+ where add _ (OldFact old) (NewFact new) =
+ (changeIf $ sizeRegSet join > sizeRegSet old, join)
+ where !join = plusRegSet old new
+
-- | A mapping from block labels to the variables live on entry
type BlockEntryLiveness = BlockEnv CmmLive
@@ -43,16 +43,17 @@ type BlockEntryLiveness = BlockEnv CmmLive
-- | Calculated liveness info for a CmmGraph
-----------------------------------------------------------------------------
-cmmLiveness :: CmmGraph -> FuelUniqSM BlockEntryLiveness
+cmmLiveness :: CmmGraph -> BlockEntryLiveness
cmmLiveness graph =
- liftM check $ liftM snd $ dataflowPassBwd graph [] $ analBwd liveLattice xferLive
+ check $ dataflowAnalBwd graph [] $ analBwd liveLattice xferLive
where entry = g_entry graph
- check facts = noLiveOnEntry entry (expectJust "check" $ mapLookup entry facts) facts
+ check facts = noLiveOnEntry entry
+ (expectJust "check" $ mapLookup entry facts) facts
-- | On entry to the procedure, there had better not be any LocalReg's live-in.
noLiveOnEntry :: BlockId -> CmmLive -> a -> a
noLiveOnEntry bid in_fact x =
- if isEmptyUniqSet in_fact then x
+ if nullRegSet in_fact then x
else pprPanic "LocalReg's live-in to graph" (ppr bid <+> ppr in_fact)
-- | The transfer equations use the traditional 'gen' and 'kill'
@@ -60,42 +61,42 @@ noLiveOnEntry bid in_fact x =
gen :: UserOfLocalRegs a => a -> RegSet -> RegSet
gen a live = foldRegsUsed extendRegSet live a
kill :: DefinerOfLocalRegs a => a -> RegSet -> RegSet
-kill a live = foldRegsDefd delOneFromUniqSet live a
+kill a live = foldRegsDefd deleteFromRegSet live a
-gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a) => a -> CmmLive -> CmmLive
+gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a)
+ => a -> CmmLive -> CmmLive
gen_kill a = gen a . kill a
-- | The transfer function
--- EZY: Bits of this analysis are duplicated in CmmSpillReload, though
--- it's not really easy to efficiently reuse all of this. Keep in mind
--- if you need to update this analysis.
xferLive :: BwdTransfer CmmNode CmmLive
xferLive = mkBTransfer3 fst mid lst
where fst _ f = f
mid :: CmmNode O O -> CmmLive -> CmmLive
mid n f = gen_kill n f
lst :: CmmNode O C -> FactBase CmmLive -> CmmLive
- -- slightly inefficient: kill is unnecessary for emptyRegSet
- lst n f = gen_kill n
- $ case n of CmmCall{} -> emptyRegSet
- CmmForeignCall{} -> emptyRegSet
- _ -> joinOutFacts liveLattice n f
+ lst n f = gen_kill n $ joinOutFacts liveLattice n f
-----------------------------------------------------------------------------
-- Removing assignments to dead variables
-----------------------------------------------------------------------------
-removeDeadAssignments :: CmmGraph -> FuelUniqSM CmmGraph
+removeDeadAssignments :: CmmGraph -> UniqSM (CmmGraph, BlockEnv CmmLive)
removeDeadAssignments g =
- liftM fst $ dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites
- where rewrites = deepBwdRw3 nothing middle nothing
- -- Beware: deepBwdRw with one polymorphic function seems more reasonable here,
- -- but GHC panics while compiling, see bug #4045.
+ dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites
+ where rewrites = mkBRewrite3 nothing middle nothing
+ -- SDM: no need for deepBwdRw here, we only rewrite to empty
+ -- Beware: deepBwdRw with one polymorphic function seems more
+ -- reasonable here, but GHC panics while compiling, see bug
+ -- #4045.
middle :: CmmNode O O -> Fact O CmmLive -> CmmReplGraph O O
- middle (CmmAssign (CmmLocal reg') _) live | not (reg' `elemRegSet` live) = return $ Just emptyGraph
+ middle (CmmAssign (CmmLocal reg') _) live
+ | not (reg' `elemRegSet` live)
+ = return $ Just emptyGraph
-- XXX maybe this should be somewhere else...
- middle (CmmAssign lhs (CmmReg rhs)) _ | lhs == rhs = return $ Just emptyGraph
- middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs = return $ Just emptyGraph
+ middle (CmmAssign lhs (CmmReg rhs)) _ | lhs == rhs
+ = return $ Just emptyGraph
+ middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs
+ = return $ Just emptyGraph
middle _ _ = return Nothing
nothing :: CmmNode e x -> Fact x CmmLive -> CmmReplGraph e x
diff --git a/compiler/cmm/CmmNode.hs b/compiler/cmm/CmmNode.hs
index 4844af9d9a..0a5f5170f0 100644
--- a/compiler/cmm/CmmNode.hs
+++ b/compiler/cmm/CmmNode.hs
@@ -18,7 +18,7 @@ module CmmNode (
CmmNode(..), ForeignHint(..), CmmFormal, CmmActual,
UpdFrameOffset, Convention(..), ForeignConvention(..), ForeignTarget(..),
mapExp, mapExpDeep, wrapRecExp, foldExp, foldExpDeep, wrapRecExpf,
- mapExpM, mapExpDeepM, wrapRecExpM
+ mapExpM, mapExpDeepM, wrapRecExpM, mapSuccessors
) where
import CmmExpr
@@ -35,15 +35,17 @@ import Prelude hiding (succ)
------------------------
-- CmmNode
+#define ULabel {-# UNPACK #-} !Label
+
data CmmNode e x where
- CmmEntry :: Label -> CmmNode C O
+ CmmEntry :: ULabel -> CmmNode C O
CmmComment :: FastString -> CmmNode O O
- CmmAssign :: CmmReg -> CmmExpr -> CmmNode O O
+ CmmAssign :: !CmmReg -> !CmmExpr -> CmmNode O O
-- Assign to register
- CmmStore :: CmmExpr -> CmmExpr -> CmmNode O O
+ CmmStore :: !CmmExpr -> !CmmExpr -> CmmNode O O
-- Assign to memory location. Size is
-- given by cmmExprType of the rhs.
@@ -60,11 +62,12 @@ data CmmNode e x where
-- bug for what can be put in arguments, see
-- Note [Register Parameter Passing]
- CmmBranch :: Label -> CmmNode O C -- Goto another block in the same procedure
+ CmmBranch :: ULabel -> CmmNode O C
+ -- Goto another block in the same procedure
CmmCondBranch :: { -- conditional branch
cml_pred :: CmmExpr,
- cml_true, cml_false :: Label
+ cml_true, cml_false :: ULabel
} -> CmmNode O C
CmmSwitch :: CmmExpr -> [Maybe Label] -> CmmNode O C -- Table branch
@@ -78,15 +81,20 @@ data CmmNode e x where
cml_cont :: Maybe Label,
-- Label of continuation (Nothing for return or tail call)
-
--- ToDO: add this:
--- cml_args_regs :: [GlobalReg],
--- It says which GlobalRegs are live for the parameters at the
--- moment of the call. Later stages can use this to give liveness
--- everywhere, which in turn guides register allocation.
--- It is the companion of cml_args; cml_args says which stack words
--- hold parameters, while cml_arg_regs says which global regs hold parameters.
--- But do note [Register parameter passing]
+ --
+ -- Note [Continuation BlockId]: these BlockIds are called
+ -- Continuation BlockIds, and are the only BlockIds that can
+ -- occur in CmmExprs, namely as (CmmLit (CmmBlock b)) or
+ -- (CmmStackSlot (Young b) _).
+
+ cml_args_regs :: [GlobalReg],
+ -- The argument GlobalRegs (Rx, Fx, Dx, Lx) that are passed
+ -- to the call. This is essential information for the
+ -- native code generator's register allocator; without
+ -- knowing which GlobalRegs are live it has to assume that
+ -- they are all live. This list should only include
+ -- GlobalRegs that are mapped to real machine registers on
+ -- the target platform.
cml_args :: ByteOff,
-- Byte offset, from the *old* end of the Area associated with
@@ -117,7 +125,7 @@ data CmmNode e x where
tgt :: ForeignTarget, -- call target and convention
res :: [CmmFormal], -- zero or more results
args :: [CmmActual], -- zero or more arguments; see Note [Register parameter passing]
- succ :: Label, -- Label of continuation
+ succ :: ULabel, -- Label of continuation
updfr :: UpdFrameOffset, -- where the update frame is (for building infotable)
intrbl:: Bool -- whether or not the call is interruptible
} -> CmmNode O C
@@ -181,7 +189,7 @@ instance Eq (CmmNode e x) where
(CmmBranch a) == (CmmBranch a') = a==a'
(CmmCondBranch a b c) == (CmmCondBranch a' b' c') = a==a' && b==b' && c==c'
(CmmSwitch a b) == (CmmSwitch a' b') = a==a' && b==b'
- (CmmCall a b c d e) == (CmmCall a' b' c' d' e') = a==a' && b==b' && c==c' && d==d' && e==e'
+ (CmmCall a b c d e f) == (CmmCall a' b' c' d' e' f') = a==a' && b==b' && c==c' && d==d' && e==e' && f==f'
(CmmForeignCall a b c d e f) == (CmmForeignCall a' b' c' d' e' f') = a==a' && b==b' && c==c' && d==d' && e==e' && f==f'
_ == _ = False
@@ -198,10 +206,6 @@ instance NonLocal CmmNode where
successors (CmmForeignCall {succ=l}) = [l]
-instance HooplNode CmmNode where
- mkBranchNode label = CmmBranch label
- mkLabelNode label = CmmEntry label
-
--------------------------------------------------
-- Various helper types
@@ -218,14 +222,6 @@ data Convention
| GC -- Entry to the garbage collector: uses the node reg!
| PrimOpCall -- Calling prim ops
| PrimOpReturn -- Returning from prim ops
- | Foreign -- Foreign call/return
- ForeignConvention
- | Private
- -- Used for control transfers within a (pre-CPS) procedure All
- -- jump sites known, never pushed on the stack (hence no SRT)
- -- You can choose whatever calling convention you please
- -- (provided you make sure all the call sites agree)!
- -- This data type eventually to be extended to record the convention.
deriving( Eq )
data ForeignConvention
@@ -283,37 +279,6 @@ instance DefinerOfLocalRegs (CmmNode e x) where
fold f z n = foldRegsDefd f z n
-instance UserOfSlots (CmmNode e x) where
- foldSlotsUsed f z n = case n of
- CmmAssign _ expr -> fold f z expr
- CmmStore addr rval -> fold f (fold f z addr) rval
- CmmUnsafeForeignCall _ _ args -> fold f z args
- CmmCondBranch expr _ _ -> fold f z expr
- CmmSwitch expr _ -> fold f z expr
- CmmCall {cml_target=tgt} -> fold f z tgt
- CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args
- _ -> z
- where fold :: forall a b.
- UserOfSlots a =>
- (b -> SubArea -> b) -> b -> a -> b
- fold f z n = foldSlotsUsed f z n
-
-instance UserOfSlots ForeignTarget where
- foldSlotsUsed f z (ForeignTarget e _) = foldSlotsUsed f z e
- foldSlotsUsed _f z (PrimTarget _) = z
-
-instance DefinerOfSlots (CmmNode e x) where
- foldSlotsDefd f z n = case n of
- CmmStore (CmmStackSlot a i) expr -> f z (a, i, widthInBytes $ typeWidth $ cmmExprType expr)
- CmmForeignCall {res=res} -> fold f z $ map foreign_call_slot res
- _ -> z
- where
- fold :: forall a b.
- DefinerOfSlots a =>
- (b -> SubArea -> b) -> b -> a -> b
- fold f z n = foldSlotsDefd f z n
- foreign_call_slot r = case widthInBytes $ typeWidth $ localRegType r of w -> (RegSlot r, w, w)
-
-----------------------------------
-- mapping Expr in CmmNode
@@ -336,7 +301,7 @@ mapExp f (CmmUnsafeForeignCall tgt fs as) = CmmUnsafeForeignCall (mapFore
mapExp _ l@(CmmBranch _) = l
mapExp f (CmmCondBranch e ti fi) = CmmCondBranch (f e) ti fi
mapExp f (CmmSwitch e tbl) = CmmSwitch (f e) tbl
-mapExp f (CmmCall tgt mb_id o i s) = CmmCall (f tgt) mb_id o i s
+mapExp f n@CmmCall {cml_target=tgt} = n{cml_target = f tgt}
mapExp f (CmmForeignCall tgt fs as succ updfr intrbl) = CmmForeignCall (mapForeignTarget f tgt) fs (map f as) succ updfr intrbl
mapExpDeep :: (CmmExpr -> CmmExpr) -> CmmNode e x -> CmmNode e x
@@ -362,7 +327,7 @@ mapExpM f (CmmStore addr e) = (\[addr', e'] -> CmmStore addr' e') `fmap`
mapExpM _ (CmmBranch _) = Nothing
mapExpM f (CmmCondBranch e ti fi) = (\x -> CmmCondBranch x ti fi) `fmap` f e
mapExpM f (CmmSwitch e tbl) = (\x -> CmmSwitch x tbl) `fmap` f e
-mapExpM f (CmmCall tgt mb_id o i s) = (\x -> CmmCall x mb_id o i s) `fmap` f tgt
+mapExpM f (CmmCall tgt mb_id r o i s) = (\x -> CmmCall x mb_id r o i s) `fmap` f tgt
mapExpM f (CmmUnsafeForeignCall tgt fs as)
= case mapForeignTargetM f tgt of
Just tgt' -> Just (CmmUnsafeForeignCall tgt' fs (mapListJ f as))
@@ -416,4 +381,20 @@ foldExp f (CmmCall {cml_target=tgt}) z = f tgt z
foldExp f (CmmForeignCall {tgt=tgt, args=args}) z = foldr f (foldExpForeignTarget f tgt z) args
foldExpDeep :: (CmmExpr -> z -> z) -> CmmNode e x -> z -> z
-foldExpDeep f = foldExp $ wrapRecExpf f
+foldExpDeep f = foldExp go
+ where -- go :: CmmExpr -> z -> z
+ go e@(CmmMachOp _ es) z = gos es $! f e z
+ go e@(CmmLoad addr _) z = go addr $! f e z
+ go e z = f e z
+
+ gos [] z = z
+ gos (e:es) z = gos es $! f e z
+
+-- -----------------------------------------------------------------------------
+
+mapSuccessors :: (Label -> Label) -> CmmNode O C -> CmmNode O C
+mapSuccessors f (CmmBranch bid) = CmmBranch (f bid)
+mapSuccessors f (CmmCondBranch p y n) = CmmCondBranch p (f y) (f n)
+mapSuccessors f (CmmSwitch e arms) = CmmSwitch e (map (fmap f) arms)
+mapSuccessors _ n = n
+
diff --git a/compiler/cmm/CmmOpt.hs b/compiler/cmm/CmmOpt.hs
index 8cc18fc1ca..8ff04cfa7b 100644
--- a/compiler/cmm/CmmOpt.hs
+++ b/compiler/cmm/CmmOpt.hs
@@ -145,8 +145,7 @@ To inline _smi:
-}
countUses :: UserOfLocalRegs a => a -> UniqFM Int
-countUses a = foldRegsUsed (\m r -> addToUFM m r (count m r + 1)) emptyUFM a
- where count m r = lookupWithDefaultUFM m (0::Int) r
+countUses a = foldRegsUsed (\m r -> addToUFM_C (+) m r 1) emptyUFM a
cmmMiniInline :: DynFlags -> [CmmBasicBlock] -> [CmmBasicBlock]
cmmMiniInline dflags blocks = map do_inline blocks
@@ -157,25 +156,16 @@ cmmMiniInlineStmts :: DynFlags -> UniqFM Int -> [CmmStmt] -> [CmmStmt]
cmmMiniInlineStmts _ _ [] = []
cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr) : stmts)
-- not used: just discard this assignment
- | Nothing <- lookupUFM uses u
+ | 0 <- lookupWithDefaultUFM uses 0 u
= cmmMiniInlineStmts dflags uses stmts
- -- used (literal): try to inline at all the use sites
- | Just n <- lookupUFM uses u, isLit expr
- =
- ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
- case lookForInlineLit u expr stmts of
- (m, stmts')
- | n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts'
- | otherwise ->
- stmt : cmmMiniInlineStmts dflags (adjustUFM (\x -> x - m) uses u) stmts'
-
- -- used (foldable to literal): try to inline at all the use sites
+ -- used (foldable to small thing): try to inline at all the use sites
| Just n <- lookupUFM uses u,
- e@(CmmLit _) <- wrapRecExp foldExp expr
+ e <- wrapRecExp foldExp expr,
+ isTiny e
=
ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
- case lookForInlineLit u e stmts of
+ case lookForInlineMany u e stmts of
(m, stmts')
| n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts'
| otherwise ->
@@ -188,6 +178,11 @@ cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr)
ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
cmmMiniInlineStmts dflags uses stmts'
where
+ isTiny (CmmLit _) = True
+ isTiny (CmmReg (CmmGlobal _)) = True
+ -- not CmmLocal: that might invalidate the usage analysis results
+ isTiny _ = False
+
platform = targetPlatform dflags
foldExp (CmmMachOp op args) = cmmMachOpFold platform op args
foldExp e = e
@@ -201,26 +196,28 @@ cmmMiniInlineStmts platform uses (stmt:stmts)
-- register, and a list of statements. Inlines the expression at all
-- use sites of the register. Returns the number of substituations
-- made and the, possibly modified, list of statements.
-lookForInlineLit :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt])
-lookForInlineLit _ _ [] = (0, [])
-lookForInlineLit u expr stmts@(stmt : rest)
- | Just n <- lookupUFM (countUses stmt) u
- = case lookForInlineLit u expr rest of
- (m, stmts) -> let z = n + m
- in z `seq` (z, inlineStmt u expr stmt : stmts)
-
- | ok_to_skip
- = case lookForInlineLit u expr rest of
+lookForInlineMany :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt])
+lookForInlineMany u expr stmts = lookForInlineMany' u expr regset stmts
+ where regset = foldRegsUsed extendRegSet emptyRegSet expr
+
+lookForInlineMany' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> (Int, [CmmStmt])
+lookForInlineMany' _ _ _ [] = (0, [])
+lookForInlineMany' u expr regset stmts@(stmt : rest)
+ | Just n <- lookupUFM (countUses stmt) u, okToInline expr stmt
+ = let stmt' = inlineStmt u expr stmt in
+ if okToSkip stmt' u expr regset
+ then case lookForInlineMany' u expr regset rest of
+ (m, stmts) -> let z = n + m
+ in z `seq` (z, stmt' : stmts)
+ else (n, stmt' : rest)
+
+ | okToSkip stmt u expr regset
+ = case lookForInlineMany' u expr regset rest of
(n, stmts) -> (n, stmt : stmts)
| otherwise
= (0, stmts)
- where
- -- We skip over assignments to registers, unless the register
- -- being assigned to is the one we're inlining.
- ok_to_skip = case stmt of
- CmmAssign (CmmLocal (LocalReg u' _)) _ | u' == u -> False
- _other -> True
+
lookForInline :: Unique -> CmmExpr -> [CmmStmt] -> Maybe [CmmStmt]
lookForInline u expr stmts = lookForInline' u expr regset stmts
@@ -229,10 +226,10 @@ lookForInline u expr stmts = lookForInline' u expr regset stmts
lookForInline' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> Maybe [CmmStmt]
lookForInline' _ _ _ [] = panic "lookForInline' []"
lookForInline' u expr regset (stmt : rest)
- | Just 1 <- lookupUFM (countUses stmt) u, ok_to_inline
+ | Just 1 <- lookupUFM (countUses stmt) u, okToInline expr stmt
= Just (inlineStmt u expr stmt : rest)
- | ok_to_skip
+ | okToSkip stmt u expr regset
= case lookForInline' u expr regset rest of
Nothing -> Nothing
Just stmts -> Just (stmt:stmts)
@@ -240,31 +237,37 @@ lookForInline' u expr regset (stmt : rest)
| otherwise
= Nothing
- where
- -- we don't inline into CmmCall if the expression refers to global
- -- registers. This is a HACK to avoid global registers clashing with
- -- C argument-passing registers, really the back-end ought to be able
- -- to handle it properly, but currently neither PprC nor the NCG can
- -- do it. See also CgForeignCall:load_args_into_temps.
- ok_to_inline = case stmt of
- CmmCall{} -> hasNoGlobalRegs expr
- _ -> True
-
- -- Expressions aren't side-effecting. Temporaries may or may not
- -- be single-assignment depending on the source (the old code
- -- generator creates single-assignment code, but hand-written Cmm
- -- and Cmm from the new code generator is not single-assignment.)
- -- So we do an extra check to make sure that the register being
- -- changed is not one we were relying on. I don't know how much of a
- -- performance hit this is (we have to create a regset for every
- -- instruction.) -- EZY
- ok_to_skip = case stmt of
- CmmNop -> True
- CmmComment{} -> True
- CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True
- CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr)
- _other -> False
+-- we don't inline into CmmCall if the expression refers to global
+-- registers. This is a HACK to avoid global registers clashing with
+-- C argument-passing registers, really the back-end ought to be able
+-- to handle it properly, but currently neither PprC nor the NCG can
+-- do it. See also CgForeignCall:load_args_into_temps.
+okToInline :: CmmExpr -> CmmStmt -> Bool
+okToInline expr CmmCall{} = hasNoGlobalRegs expr
+okToInline _ _ = True
+
+-- Expressions aren't side-effecting. Temporaries may or may not
+-- be single-assignment depending on the source (the old code
+-- generator creates single-assignment code, but hand-written Cmm
+-- and Cmm from the new code generator is not single-assignment.)
+-- So we do an extra check to make sure that the register being
+-- changed is not one we were relying on. I don't know how much of a
+-- performance hit this is (we have to create a regset for every
+-- instruction.) -- EZY
+okToSkip :: CmmStmt -> Unique -> CmmExpr -> RegSet -> Bool
+okToSkip stmt u expr regset
+ = case stmt of
+ CmmNop -> True
+ CmmComment{} -> True
+ CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True
+ CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr)
+ CmmStore _ _ -> not_a_load expr
+ _other -> False
+ where
+ not_a_load (CmmMachOp _ args) = all not_a_load args
+ not_a_load (CmmLoad _ _) = False
+ not_a_load _ = True
inlineStmt :: Unique -> CmmExpr -> CmmStmt -> CmmStmt
inlineStmt u a (CmmAssign r e) = CmmAssign r (inlineExpr u a e)
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index 075ed22ea9..f46d49e022 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -230,35 +230,31 @@ lits :: { [ExtFCode CmmExpr] }
cmmproc :: { ExtCode }
-- TODO: add real SRT/info tables to parsed Cmm
- : info maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}'
- { do ((entry_ret_label, info, live, formals, gc_block, frame), stmts) <-
+ : info maybe_formals_without_hints '{' body '}'
+ { do ((entry_ret_label, info, live, formals), stmts) <-
getCgStmtsEC' $ loopDecls $ do {
(entry_ret_label, info, live) <- $1;
formals <- sequence $2;
- gc_block <- $3;
- frame <- $4;
- $6;
- return (entry_ret_label, info, live, formals, gc_block, frame) }
+ $4;
+ return (entry_ret_label, info, live, formals) }
blks <- code (cgStmtsToBlocks stmts)
- code (emitInfoTableAndCode entry_ret_label (CmmInfo gc_block frame info) formals blks) }
+ code (emitInfoTableAndCode entry_ret_label info formals blks) }
| info maybe_formals_without_hints ';'
{ do (entry_ret_label, info, live) <- $1;
formals <- sequence $2;
- code (emitInfoTableAndCode entry_ret_label (CmmInfo Nothing Nothing info) formals []) }
+ code (emitInfoTableAndCode entry_ret_label info formals []) }
- | NAME maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}'
+ | NAME maybe_formals_without_hints '{' body '}'
{% withThisPackage $ \pkg ->
do newFunctionName $1 pkg
- ((formals, gc_block, frame), stmts) <-
+ (formals, stmts) <-
getCgStmtsEC' $ loopDecls $ do {
formals <- sequence $2;
- gc_block <- $3;
- frame <- $4;
- $6;
- return (formals, gc_block, frame) }
+ $4;
+ return formals }
blks <- code (cgStmtsToBlocks stmts)
- code (emitProc (CmmInfo gc_block frame CmmNonInfoTable) (mkCmmCodeLabel pkg $1) formals blks) }
+ code (emitProc CmmNonInfoTable (mkCmmCodeLabel pkg $1) formals blks) }
info :: { ExtFCode (CLabel, CmmInfoTable, [Maybe LocalReg]) }
: 'INFO_TABLE' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ')'
@@ -599,18 +595,7 @@ formals_without_hints :: { [ExtFCode LocalReg] }
formal_without_hint :: { ExtFCode LocalReg }
: type NAME { newLocal $1 $2 }
-maybe_frame :: { ExtFCode (Maybe UpdateFrame) }
- : {- empty -} { return Nothing }
- | 'jump' expr '(' exprs0 ')' { do { target <- $2;
- args <- sequence $4;
- return $ Just (UpdateFrame target args) } }
-
-maybe_gc_block :: { ExtFCode (Maybe BlockId) }
- : {- empty -} { return Nothing }
- | 'goto' NAME
- { do l <- lookupLabel $2; return (Just l) }
-
-type :: { CmmType }
+type :: { CmmType }
: 'bits8' { b8 }
| typenot8 { $1 }
@@ -1073,7 +1058,8 @@ parseCmmFile dflags filename = do
let msg = mkPlainErrMsg dflags span err
return ((emptyBag, unitBag msg), Nothing)
POk pst code -> do
- cmm <- initC dflags no_module (getCmm (unEC code initEnv [] >> return ()))
+ st <- initC
+ let (cmm,_) = runC dflags no_module st (getCmm (unEC code initEnv [] >> return ()))
let ms = getMessages pst
if (errorsFound dflags ms)
then return (ms, Nothing)
diff --git a/compiler/cmm/CmmPipeline.hs b/compiler/cmm/CmmPipeline.hs
index 7361bbf385..bb8d5b2f22 100644
--- a/compiler/cmm/CmmPipeline.hs
+++ b/compiler/cmm/CmmPipeline.hs
@@ -11,25 +11,23 @@ module CmmPipeline (
import CLabel
import Cmm
-import CmmLive
+import CmmLint
import CmmBuildInfoTables
import CmmCommonBlockElim
import CmmProcPoint
-import CmmSpillReload
-import CmmRewriteAssignments
-import CmmStackLayout
import CmmContFlowOpt
-import OptimizationFuel
+import CmmLayoutStack
+import UniqSupply
import DynFlags
import ErrUtils
import HscTypes
import Data.Maybe
import Control.Monad
-import Data.Map (Map)
-import qualified Data.Map as Map
import Outputable
-import StaticFlags
+
+import qualified Data.Set as Set
+import Data.Map (Map)
-----------------------------------------------------------------------------
-- | Top level driver for C-- pipeline
@@ -53,32 +51,28 @@ import StaticFlags
-- we actually need to do the initial pass.
cmmPipeline :: HscEnv -- Compilation env including
-- dynamic flags: -dcmm-lint -ddump-cps-cmm
- -> (TopSRT, [CmmGroup]) -- SRT table and accumulating list of compiled procs
+ -> TopSRT -- SRT table and accumulating list of compiled procs
-> CmmGroup -- Input C-- with Procedures
- -> IO (TopSRT, [CmmGroup]) -- Output CPS transformed C--
-cmmPipeline hsc_env (topSRT, rst) prog =
+ -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
+cmmPipeline hsc_env topSRT prog =
do let dflags = hsc_dflags hsc_env
--
showPass dflags "CPSZ"
- let tops = runCmmContFlowOpts prog
- (cafEnvs, tops) <- liftM unzip $ mapM (cpsTop hsc_env) tops
+ (cafEnvs, tops) <- {-# SCC "tops" #-} liftM unzip $ mapM (cpsTop hsc_env) prog
-- tops :: [[(CmmDecl,CAFSet]] (one list per group)
- let topCAFEnv = mkTopCAFInfo (concat cafEnvs)
+ let topCAFEnv = {-# SCC "topCAFEnv" #-} mkTopCAFInfo (concat cafEnvs)
-- folding over the groups
- (topSRT, tops) <- foldM (toTops hsc_env topCAFEnv) (topSRT, []) tops
+ (topSRT, tops) <- {-# SCC "toTops" #-} foldM (toTops topCAFEnv) (topSRT, []) tops
let cmms :: CmmGroup
cmms = reverse (concat tops)
dumpIfSet_dyn dflags Opt_D_dump_cps_cmm "Post CPS Cmm" (ppr cmms)
- -- SRT is not affected by control flow optimization pass
- let prog' = runCmmContFlowOpts cmms
-
- return (topSRT, prog' : rst)
+ return (topSRT, cmms)
{- [Note global fuel]
~~~~~~~~~~~~~~~~~~~~~
@@ -92,105 +86,110 @@ global to one compiler session.
-- -ddump-cmmz
cpsTop :: HscEnv -> CmmDecl -> IO ([(CLabel, CAFSet)], [(CAFSet, CmmDecl)])
-cpsTop _ p@(CmmData {}) = return ([], [(Map.empty, p)])
+cpsTop _ p@(CmmData {}) = return ([], [(Set.empty, p)])
cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}}) l g) =
do
- -- Why bother doing these early: dualLivenessWithInsertion,
- -- insertLateReloads, rewriteAssignments?
+ ----------- Control-flow optimisations ---------------
+ g <- {-# SCC "cmmCfgOpts(1)" #-} return $ cmmCfgOpts g
+ dump Opt_D_dump_cmmz_cfg "Post control-flow optimsations" g
----------- Eliminate common blocks -------------------
- g <- return $ elimCommonBlocks g
+ g <- {-# SCC "elimCommonBlocks" #-} return $ elimCommonBlocks g
dump Opt_D_dump_cmmz_cbe "Post common block elimination" g
- -- Any work storing block Labels must be performed _after_ elimCommonBlocks
+ -- Any work storing block Labels must be performed _after_
+ -- elimCommonBlocks
----------- Proc points -------------------
- let callPPs = callProcPoints g
- procPoints <- run $ minimalProcPointSet (targetPlatform dflags) callPPs g
- g <- run $ addProcPointProtocols callPPs procPoints g
- dump Opt_D_dump_cmmz_proc "Post Proc Points Added" g
-
- ----------- Spills and reloads -------------------
- g <- run $ dualLivenessWithInsertion procPoints g
- dump Opt_D_dump_cmmz_spills "Post spills and reloads" g
-
- ----------- Sink and inline assignments -------------------
- g <- runOptimization $ rewriteAssignments platform g
- dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g
-
- ----------- Eliminate dead assignments -------------------
- g <- runOptimization $ removeDeadAssignments g
- dump Opt_D_dump_cmmz_dead "Post remove dead assignments" g
-
- ----------- Zero dead stack slots (Debug only) ---------------
- -- Debugging: stubbing slots on death can cause crashes early
- g <- if opt_StubDeadValues
- then run $ stubSlotsOnDeath g
- else return g
- dump Opt_D_dump_cmmz_stub "Post stub dead stack slots" g
-
- --------------- Stack layout ----------------
- slotEnv <- run $ liveSlotAnal g
- let spEntryMap = getSpEntryMap entry_off g
- mbpprTrace "live slot analysis results: " (ppr slotEnv) $ return ()
- let areaMap = layout procPoints spEntryMap slotEnv entry_off g
- mbpprTrace "areaMap" (ppr areaMap) $ return ()
-
- ------------ Manifest the stack pointer --------
- g <- run $ manifestSP spEntryMap areaMap entry_off g
- dump Opt_D_dump_cmmz_sp "Post manifestSP" g
- -- UGH... manifestSP can require updates to the procPointMap.
- -- We can probably do something quicker here for the update...
+ let callPPs = {-# SCC "callProcPoints" #-} callProcPoints g
+ procPoints <- {-# SCC "minimalProcPointSet" #-} runUniqSM $
+ minimalProcPointSet (targetPlatform dflags) callPPs g
+
+ ----------- Layout the stack and manifest Sp ---------------
+ -- (also does: removeDeadAssignments, and lowerSafeForeignCalls)
+ (g, stackmaps) <- {-# SCC "layoutStack" #-}
+ runUniqSM $ cmmLayoutStack procPoints entry_off g
+ dump Opt_D_dump_cmmz_sp "Layout Stack" g
+
+-- g <- {-# SCC "sink" #-} runUniqSM $ cmmSink g
+-- dump Opt_D_dump_cmmz_rewrite "Sink assignments" g
+
+-- ----------- Sink and inline assignments -------------------
+-- g <- {-# SCC "rewriteAssignments" #-} runOptimization $
+-- rewriteAssignments platform g
+-- dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g
------------- Split into separate procedures ------------
- procPointMap <- run $ procPointAnalysis procPoints g
- dump Opt_D_dump_cmmz_procmap "procpoint map" procPointMap
- gs <- run $ splitAtProcPoints l callPPs procPoints procPointMap
- (CmmProc h l g)
- mapM_ (dump Opt_D_dump_cmmz_split "Post splitting") gs
-
- ------------- More CAFs and foreign calls ------------
- cafEnv <- run $ cafAnal g
- let localCAFs = catMaybes $ map (localCAFInfo cafEnv) gs
+ procPointMap <- {-# SCC "procPointAnalysis" #-} runUniqSM $
+ procPointAnalysis procPoints g
+ dumpWith dflags Opt_D_dump_cmmz_procmap "procpoint map" procPointMap
+ gs <- {-# SCC "splitAtProcPoints" #-} runUniqSM $
+ splitAtProcPoints l callPPs procPoints procPointMap (CmmProc h l g)
+ dumps Opt_D_dump_cmmz_split "Post splitting" gs
+
+ ------------- More CAFs ------------------------------
+ let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
+ let localCAFs = {-# SCC "localCAFs" #-} catMaybes $ map (localCAFInfo cafEnv) gs
mbpprTrace "localCAFs" (ppr localCAFs) $ return ()
- gs <- run $ mapM (lowerSafeForeignCalls areaMap) gs
- mapM_ (dump Opt_D_dump_cmmz_lower "Post lowerSafeForeignCalls") gs
-
-- NO MORE GRAPH TRANSFORMATION AFTER HERE -- JUST MAKING INFOTABLES
- gs <- return $ map (setInfoTableStackMap slotEnv areaMap) gs
- mapM_ (dump Opt_D_dump_cmmz_info "after setInfoTableStackMap") gs
- gs <- return $ map (bundleCAFs cafEnv) gs
- mapM_ (dump Opt_D_dump_cmmz_cafs "after bundleCAFs") gs
+ gs <- {-# SCC "setInfoTableStackMap" #-}
+ return $ map (setInfoTableStackMap stackmaps) gs
+ dumps Opt_D_dump_cmmz_info "after setInfoTableStackMap" gs
+
+ ----------- Control-flow optimisations ---------------
+ gs <- {-# SCC "cmmCfgOpts(2)" #-} return $ map cmmCfgOptsProc gs
+ dumps Opt_D_dump_cmmz_cfg "Post control-flow optimsations" gs
+
+ gs <- {-# SCC "bundleCAFs" #-} return $ map (bundleCAFs cafEnv) gs
+ dumps Opt_D_dump_cmmz_cafs "after bundleCAFs" gs
+
return (localCAFs, gs)
-- gs :: [ (CAFSet, CmmDecl) ]
-- localCAFs :: [ (CLabel, CAFSet) ] -- statics filtered out(?)
where dflags = hsc_dflags hsc_env
- platform = targetPlatform dflags
- mbpprTrace x y z = if dopt Opt_D_dump_cmmz dflags then pprTrace x y z else z
- dump f = dumpWith ppr f
- dumpWith pprFun f txt g = do
- -- ToDo: No easy way of say "dump all the cmmz, *and* split
- -- them into files." Also, -ddump-cmmz doesn't play nicely
- -- with -ddump-to-file, since the headers get omitted.
- dumpIfSet_dyn dflags f txt (pprFun g)
- when (not (dopt f dflags)) $
- dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (pprFun g)
- -- Runs a required transformation/analysis
- run = runInfiniteFuelIO (hsc_OptFuel hsc_env)
- -- Runs an optional transformation/analysis (and should
- -- thus be subject to optimization fuel)
- runOptimization = runFuelIO (hsc_OptFuel hsc_env)
+ mbpprTrace x y z | dopt Opt_D_dump_cmmz dflags = pprTrace x y z
+ | otherwise = z
+ dump = dumpGraph dflags
+
+ dumps flag name
+ = mapM_ (dumpWith dflags flag name)
+
+runUniqSM :: UniqSM a -> IO a
+runUniqSM m = do
+ us <- mkSplitUniqSupply 'u'
+ return (initUs_ us m)
+
+
+dumpGraph :: DynFlags -> DynFlag -> String -> CmmGraph -> IO ()
+dumpGraph dflags flag name g = do
+ when (dopt Opt_DoCmmLinting dflags) $ do_lint g
+ dumpWith dflags flag name g
+ where
+ do_lint g = case cmmLintGraph g of
+ Just err -> do { fatalErrorMsg dflags err
+ ; ghcExit dflags 1
+ }
+ Nothing -> return ()
+
+dumpWith :: Outputable a => DynFlags -> DynFlag -> String -> a -> IO ()
+dumpWith dflags flag txt g = do
+ -- ToDo: No easy way of say "dump all the cmmz, *and* split
+ -- them into files." Also, -ddump-cmmz doesn't play nicely
+ -- with -ddump-to-file, since the headers get omitted.
+ dumpIfSet_dyn dflags flag txt (ppr g)
+ when (not (dopt flag dflags)) $
+ dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (ppr g)
-- This probably belongs in CmmBuildInfoTables?
-- We're just finishing the job here: once we know what CAFs are defined
-- in non-static closures, we can build the SRTs.
-toTops :: HscEnv -> Map CLabel CAFSet -> (TopSRT, [[CmmDecl]])
- -> [(CAFSet, CmmDecl)] -> IO (TopSRT, [[CmmDecl]])
-toTops hsc_env topCAFEnv (topSRT, tops) gs =
+toTops :: Map CLabel CAFSet -> (TopSRT, [[CmmDecl]])
+ -> [(CAFSet, CmmDecl)] -> IO (TopSRT, [[CmmDecl]])
+toTops topCAFEnv (topSRT, tops) gs =
do let setSRT (topSRT, rst) g =
do (topSRT, gs) <- setInfoTableSRT topCAFEnv topSRT g
return (topSRT, gs : rst)
- (topSRT, gs') <- runFuelIO (hsc_OptFuel hsc_env) $ foldM setSRT (topSRT, []) gs
+ (topSRT, gs') <- runUniqSM $ foldM setSRT (topSRT, []) gs
return (topSRT, concat gs' : tops)
diff --git a/compiler/cmm/CmmProcPoint.hs b/compiler/cmm/CmmProcPoint.hs
index f50d850b3a..ebe40d9c9e 100644
--- a/compiler/cmm/CmmProcPoint.hs
+++ b/compiler/cmm/CmmProcPoint.hs
@@ -4,7 +4,7 @@
module CmmProcPoint
( ProcPointSet, Status(..)
, callProcPoints, minimalProcPointSet
- , addProcPointProtocols, splitAtProcPoints, procPointAnalysis
+ , splitAtProcPoints, procPointAnalysis
)
where
@@ -13,22 +13,17 @@ import Prelude hiding (last, unzip, succ, zip)
import BlockId
import CLabel
import Cmm
+import PprCmm ()
import CmmUtils
-import CmmContFlowOpt
import CmmInfo
-import CmmLive
-import Constants
import Data.List (sortBy)
import Maybes
-import MkGraph
import Control.Monad
-import OptimizationFuel
import Outputable
import Platform
-import UniqSet
import UniqSupply
-import Compiler.Hoopl
+import Hoopl
import qualified Data.Map as Map
@@ -103,34 +98,50 @@ instance Outputable Status where
(hsep $ punctuate comma $ map ppr $ setElems ps)
ppr ProcPoint = text "<procpt>"
-lattice :: DataflowLattice Status
-lattice = DataflowLattice "direct proc-point reachability" unreached add_to
- where unreached = ReachedBy setEmpty
- add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint)
- add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint) -- because of previous case
- add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p')) =
- let union = setUnion p' p
- in if setSize union > setSize p then (SomeChange, ReachedBy union)
- else (NoChange, ReachedBy p)
--------------------------------------------------
+-- Proc point analysis
+
+procPointAnalysis :: ProcPointSet -> CmmGraph -> UniqSM (BlockEnv Status)
+-- Once you know what the proc-points are, figure out
+-- what proc-points each block is reachable from
+procPointAnalysis procPoints g =
+ -- pprTrace "procPointAnalysis" (ppr procPoints) $
+ dataflowAnalFwdBlocks g initProcPoints $ analFwd lattice forward
+ where initProcPoints = [(id, ProcPoint) | id <- setElems procPoints]
+
-- transfer equations
forward :: FwdTransfer CmmNode Status
-forward = mkFTransfer3 first middle ((mkFactBase lattice . ) . last)
- where first :: CmmNode C O -> Status -> Status
- first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id
- first _ x = x
+forward = mkFTransfer3 first middle last
+ where
+ first :: CmmNode C O -> Status -> Status
+ first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id
+ first _ x = x
- middle _ x = x
+ middle _ x = x
- last :: CmmNode O C -> Status -> [(Label, Status)]
- last (CmmCall {cml_cont = Just k}) _ = [(k, ProcPoint)]
- last (CmmForeignCall {succ = k}) _ = [(k, ProcPoint)]
- last l x = map (\id -> (id, x)) (successors l)
+ last :: CmmNode O C -> Status -> FactBase Status
+ last l x = mkFactBase lattice $ map (\id -> (id, x)) (successors l)
--- It is worth distinguishing two sets of proc points:
--- those that are induced by calls in the original graph
--- and those that are introduced because they're reachable from multiple proc points.
+lattice :: DataflowLattice Status
+lattice = DataflowLattice "direct proc-point reachability" unreached add_to
+ where unreached = ReachedBy setEmpty
+ add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint)
+ add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint)
+ -- because of previous case
+ add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p'))
+ | setSize union > setSize p = (SomeChange, ReachedBy union)
+ | otherwise = (NoChange, ReachedBy p)
+ where
+ union = setUnion p' p
+
+----------------------------------------------------------------------
+
+-- It is worth distinguishing two sets of proc points: those that are
+-- induced by calls in the original graph and those that are
+-- introduced because they're reachable from multiple proc points.
+--
+-- Extract the set of Continuation BlockIds, see Note [Continuation BlockIds].
callProcPoints :: CmmGraph -> ProcPointSet
callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g
where add :: CmmBlock -> BlockSet -> BlockSet
@@ -139,21 +150,17 @@ callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g
CmmForeignCall {succ=k} -> setInsert k set
_ -> set
-minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph -> FuelUniqSM ProcPointSet
+minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph
+ -> UniqSM ProcPointSet
-- Given the set of successors of calls (which must be proc-points)
-- figure out the minimal set of necessary proc-points
-minimalProcPointSet platform callProcPoints g = extendPPSet platform g (postorderDfs g) callProcPoints
-
-procPointAnalysis :: ProcPointSet -> CmmGraph -> FuelUniqSM (BlockEnv Status)
--- Once you know what the proc-points are, figure out
--- what proc-points each block is reachable from
-procPointAnalysis procPoints g =
- liftM snd $ dataflowPassFwd g initProcPoints $ analFwd lattice forward
- where initProcPoints = [(id, ProcPoint) | id <- setElems procPoints]
+minimalProcPointSet platform callProcPoints g
+ = extendPPSet platform g (postorderDfs g) callProcPoints
-extendPPSet :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> FuelUniqSM ProcPointSet
+extendPPSet :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> UniqSM ProcPointSet
extendPPSet platform g blocks procPoints =
do env <- procPointAnalysis procPoints g
+ -- pprTrace "extensPPSet" (ppr env) $ return ()
let add block pps = let id = entryLabel block
in case mapLookup id env of
Just ProcPoint -> setInsert id pps
@@ -179,183 +186,13 @@ extendPPSet platform g blocks procPoints =
pps -> extendPPSet g blocks
(foldl extendBlockSet procPoints' pps)
-}
- case newPoint of Just id ->
- if setMember id procPoints' then panic "added old proc pt"
- else extendPPSet platform g blocks (setInsert id procPoints')
- Nothing -> return procPoints'
-
-
-------------------------------------------------------------------------
--- Computing Proc-Point Protocols --
-------------------------------------------------------------------------
-
-{-
-
-There is one major trick, discovered by Michael Adams, which is that
-we want to choose protocols in a way that enables us to optimize away
-some continuations. The optimization is very much like branch-chain
-elimination, except that it involves passing results as well as
-control. The idea is that if a call's continuation k does nothing but
-CopyIn its results and then goto proc point P, the call's continuation
-may be changed to P, *provided* P's protocol is identical to the
-protocol for the CopyIn. We choose protocols to make this so.
-
-Here's an explanatory example; we begin with the source code (lines
-separate basic blocks):
-
- ..1..;
- x, y = g();
- goto P;
- -------
- P: ..2..;
-
-Zipperization converts this code as follows:
-
- ..1..;
- call g() returns to k;
- -------
- k: CopyIn(x, y);
- goto P;
- -------
- P: ..2..;
-
-What we'd like to do is assign P the same CopyIn protocol as k, so we
-can eliminate k:
-
- ..1..;
- call g() returns to P;
- -------
- P: CopyIn(x, y); ..2..;
-
-Of course, P may be the target of more than one continuation, and
-different continuations may have different protocols. Michael Adams
-implemented a voting mechanism, but he thinks a simple greedy
-algorithm would be just as good, so that's what we do.
-
--}
+ case newPoint of
+ Just id ->
+ if setMember id procPoints'
+ then panic "added old proc pt"
+ else extendPPSet platform g blocks (setInsert id procPoints')
+ Nothing -> return procPoints'
-data Protocol = Protocol Convention [CmmFormal] Area
- deriving Eq
-instance Outputable Protocol where
- ppr (Protocol c fs a) = text "Protocol" <+> ppr c <+> ppr fs <+> ppr a
-
--- | Function 'optimize_calls' chooses protocols only for those proc
--- points that are relevant to the optimization explained above.
--- The others are assigned by 'add_unassigned', which is not yet clever.
-
-addProcPointProtocols :: ProcPointSet -> ProcPointSet -> CmmGraph -> FuelUniqSM CmmGraph
-addProcPointProtocols callPPs procPoints g =
- do liveness <- cmmLiveness g
- (protos, g') <- optimize_calls liveness g
- blocks'' <- add_CopyOuts protos procPoints g'
- return $ ofBlockMap (g_entry g) blocks''
- where optimize_calls liveness g = -- see Note [Separate Adams optimization]
- do let (protos, blocks') =
- foldGraphBlocks maybe_add_call (mapEmpty, mapEmpty) g
- protos' = add_unassigned liveness procPoints protos
- let g' = ofBlockMap (g_entry g) (add_CopyIns callPPs protos' blocks')
- return (protos', removeUnreachableBlocks g')
- maybe_add_call :: CmmBlock -> (BlockEnv Protocol, BlockEnv CmmBlock)
- -> (BlockEnv Protocol, BlockEnv CmmBlock)
- -- ^ If the block is a call whose continuation goes to a proc point
- -- whose protocol either matches the continuation's or is not yet set,
- -- redirect the call (cf 'newblock') and set the protocol if necessary
- maybe_add_call block (protos, blocks) =
- case lastNode block of
- CmmCall tgt (Just k) args res s
- | Just proto <- mapLookup k protos,
- Just pee <- branchesToProcPoint k
- -> let newblock = replaceLastNode block (CmmCall tgt (Just pee)
- args res s)
- changed_blocks = insertBlock newblock blocks
- unchanged_blocks = insertBlock block blocks
- in case mapLookup pee protos of
- Nothing -> (mapInsert pee proto protos, changed_blocks)
- Just proto' ->
- if proto == proto' then (protos, changed_blocks)
- else (protos, unchanged_blocks)
- _ -> (protos, insertBlock block blocks)
-
- branchesToProcPoint :: BlockId -> Maybe BlockId
- -- ^ Tells whether the named block is just a branch to a proc point
- branchesToProcPoint id =
- let block = mapLookup id (toBlockMap g) `orElse`
- panic "branch out of graph"
- in case blockToNodeList block of
- (_, [], JustC (CmmBranch pee)) | setMember pee procPoints -> Just pee
- _ -> Nothing
-
--- | For now, following a suggestion by Ben Lippmeier, we pass all
--- live variables as arguments, hoping that a clever register
--- allocator might help.
-
-add_unassigned :: BlockEnv CmmLive -> ProcPointSet -> BlockEnv Protocol ->
- BlockEnv Protocol
-add_unassigned = pass_live_vars_as_args
-
-pass_live_vars_as_args :: BlockEnv CmmLive -> ProcPointSet ->
- BlockEnv Protocol -> BlockEnv Protocol
-pass_live_vars_as_args _liveness procPoints protos = protos'
- where protos' = setFold addLiveVars protos procPoints
- addLiveVars :: BlockId -> BlockEnv Protocol -> BlockEnv Protocol
- addLiveVars id protos =
- case mapLookup id protos of
- Just _ -> protos
- Nothing -> let live = emptyRegSet
- --lookupBlockEnv _liveness id `orElse`
- --panic ("no liveness at block " ++ show id)
- formals = uniqSetToList live
- prot = Protocol Private formals $ CallArea $ Young id
- in mapInsert id prot protos
-
-
--- | Add copy-in instructions to each proc point that did not arise from a call
--- instruction. (Proc-points that arise from calls already have their copy-in instructions.)
-
-add_CopyIns :: ProcPointSet -> BlockEnv Protocol -> BlockEnv CmmBlock -> BlockEnv CmmBlock
-add_CopyIns callPPs protos blocks = mapFold maybe_insert_CopyIns mapEmpty blocks
- where maybe_insert_CopyIns block blocks
- | not $ setMember bid callPPs
- , Just (Protocol c fs _area) <- mapLookup bid protos
- = let nodes = copyInSlot c fs
- (h, m, l) = blockToNodeList block
- in insertBlock (blockOfNodeList (h, nodes ++ m, l)) blocks
- | otherwise = insertBlock block blocks
- where bid = entryLabel block
-
-
--- | Add a CopyOut node before each procpoint.
--- If the predecessor is a call, then the copy outs should already be done by the callee.
--- Note: If we need to add copy-out instructions, they may require stack space,
--- so we accumulate a map from the successors to the necessary stack space,
--- then update the successors after we have finished inserting the copy-outs.
-
-add_CopyOuts :: BlockEnv Protocol -> ProcPointSet -> CmmGraph ->
- FuelUniqSM (BlockEnv CmmBlock)
-add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty) g
- where mb_copy_out :: CmmBlock -> FuelUniqSM (BlockEnv CmmBlock) ->
- FuelUniqSM (BlockEnv CmmBlock)
- mb_copy_out b z | entryLabel b == g_entry g = skip b z
- mb_copy_out b z =
- case lastNode b of
- CmmCall {} -> skip b z -- copy out done by callee
- CmmForeignCall {} -> skip b z -- copy out done by callee
- _ -> copy_out b z
- copy_out b z = foldr trySucc init (successors b) >>= finish
- where init = (\bmap -> (b, bmap)) `liftM` z
- trySucc succId z =
- if setMember succId procPoints then
- case mapLookup succId protos of
- Nothing -> z
- Just (Protocol c fs _area) -> insert z succId $ copyOutSlot c fs
- else z
- insert z succId m =
- do (b, bmap) <- z
- (b, bs) <- insertBetween b m succId
- -- pprTrace "insert for succ" (ppr succId <> ppr m) $ do
- return $ (b, foldl (flip insertBlock) bmap bs)
- finish (b, bmap) = return $ insertBlock b bmap
- skip b bs = insertBlock b `liftM` bs
-- At this point, we have found a set of procpoints, each of which should be
-- the entry point of a procedure.
@@ -370,10 +207,9 @@ add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty)
-- ToDo: use the _ret naming convention that the old code generator
-- used. -- EZY
splitAtProcPoints :: CLabel -> ProcPointSet-> ProcPointSet -> BlockEnv Status ->
- CmmDecl -> FuelUniqSM [CmmDecl]
+ CmmDecl -> UniqSM [CmmDecl]
splitAtProcPoints entry_label callPPs procPoints procMap
- (CmmProc (TopInfo {info_tbl=info_tbl,
- stack_info=stack_info})
+ (CmmProc (TopInfo {info_tbl=info_tbl})
top_l g@(CmmGraph {g_entry=entry})) =
do -- Build a map from procpoints to the blocks they reach
let addBlock b graphEnv =
@@ -384,15 +220,18 @@ splitAtProcPoints entry_label callPPs procPoints procMap
[] -> graphEnv
[id] -> add graphEnv id bid b
_ -> panic "Each block should be reachable from only one ProcPoint"
- Nothing -> pprPanic "block not reached by a proc point?" (ppr bid)
+ Nothing -> graphEnv
where bid = entryLabel b
add graphEnv procId bid b = mapInsert procId graph' graphEnv
where graph = mapLookup procId graphEnv `orElse` mapEmpty
graph' = mapInsert bid b graph
+
graphEnv <- return $ foldGraphBlocks addBlock emptyBlockMap g
+
-- Build a map from proc point BlockId to pairs of:
-- * Labels for their new procedures
- -- * Labels for the info tables of their new procedures (only if the proc point is a callPP)
+ -- * Labels for the info tables of their new procedures (only if
+ -- the proc point is a callPP)
-- Due to common blockification, we may overestimate the set of procpoints.
let add_label map pp = Map.insert pp lbls map
where lbls | pp == entry = (entry_label, Just entry_info_lbl)
@@ -401,30 +240,16 @@ splitAtProcPoints entry_label callPPs procPoints procMap
entry_info_lbl = cit_lbl info_tbl
procLabels = foldl add_label Map.empty
(filter (flip mapMember (toBlockMap g)) (setElems procPoints))
- -- For each procpoint, we need to know the SP offset on entry.
- -- If the procpoint is:
- -- - continuation of a call, the SP offset is in the call
- -- - otherwise, 0 (and left out of the spEntryMap)
- let add_sp_off :: CmmBlock -> BlockEnv CmmStackInfo -> BlockEnv CmmStackInfo
- add_sp_off b env =
- case lastNode b of
- CmmCall {cml_cont = Just succ, cml_ret_args = off, cml_ret_off = updfr_off} ->
- mapInsert succ (StackInfo { arg_space = off, updfr_space = Just updfr_off}) env
- CmmForeignCall {succ = succ, updfr = updfr_off} ->
- mapInsert succ (StackInfo { arg_space = wORD_SIZE, updfr_space = Just updfr_off}) env
- _ -> env
- spEntryMap = foldGraphBlocks add_sp_off (mapInsert entry stack_info emptyBlockMap) g
- getStackInfo id = mapLookup id spEntryMap `orElse` StackInfo {arg_space = 0, updfr_space = Nothing}
-- In each new graph, add blocks jumping off to the new procedures,
-- and replace branches to procpoints with branches to the jump-off blocks
let add_jump_block (env, bs) (pp, l) =
do bid <- liftM mkBlockId getUniqueM
- let b = blockOfNodeList (JustC (CmmEntry bid), [], JustC jump)
- StackInfo {arg_space = argSpace, updfr_space = off} = getStackInfo pp
- jump = CmmCall (CmmLit (CmmLabel l)) Nothing argSpace 0
- (off `orElse` 0) -- Jump's shouldn't need the offset...
+ let b = blockJoin (CmmEntry bid) emptyBlock jump
+ jump = CmmCall (CmmLit (CmmLabel l)) Nothing [{-XXX-}] 0 0 0
+ -- XXX: No regs are live at the call
return (mapInsert pp bid env, b : bs)
- add_jumps (newGraphEnv) (ppId, blockEnv) =
+
+ add_jumps newGraphEnv (ppId, blockEnv) =
do let needed_jumps = -- find which procpoints we currently branch to
mapFold add_if_branch_to_pp [] blockEnv
add_if_branch_to_pp :: CmmBlock -> [(BlockId, CLabel)] -> [(BlockId, CLabel)]
@@ -441,17 +266,16 @@ splitAtProcPoints entry_label callPPs procPoints procMap
foldM add_jump_block (mapEmpty, []) needed_jumps
-- update the entry block
let b = expectJust "block in env" $ mapLookup ppId blockEnv
- off = getStackInfo ppId
blockEnv' = mapInsert ppId b blockEnv
-- replace branches to procpoints with branches to jumps
blockEnv'' = toBlockMap $ replaceBranches jumpEnv $ ofBlockMap ppId blockEnv'
-- add the jump blocks to the graph
blockEnv''' = foldl (flip insertBlock) blockEnv'' jumpBlocks
- let g' = (off, ofBlockMap ppId blockEnv''')
+ let g' = ofBlockMap ppId blockEnv'''
-- pprTrace "g' pre jumps" (ppr g') $ do
return (mapInsert ppId g' newGraphEnv)
graphEnv <- foldM add_jumps emptyBlockMap $ mapToList graphEnv
- let to_proc (bid, (stack_info, g)) = case expectJust "pp label" $ Map.lookup bid procLabels of
+ let to_proc (bid, g) = case expectJust "pp label" $ Map.lookup bid procLabels of
(lbl, Just info_lbl)
| bid == entry
-> CmmProc (TopInfo {info_tbl=info_tbl, stack_info=stack_info})
@@ -462,15 +286,23 @@ splitAtProcPoints entry_label callPPs procPoints procMap
(lbl, Nothing)
-> CmmProc (TopInfo {info_tbl=CmmNonInfoTable, stack_info=stack_info})
lbl (replacePPIds g)
- -- References to procpoint IDs can now be replaced with the infotable's label
- replacePPIds g = mapGraphNodes (id, mapExp repl, mapExp repl) g
+ where
+ stack_info = StackInfo 0 Nothing -- panic "No StackInfo"
+ -- cannot use panic, this is printed by -ddump-cmmz
+
+ -- References to procpoint IDs can now be replaced with the
+ -- infotable's label
+ replacePPIds g = {-# SCC "replacePPIds" #-}
+ mapGraphNodes (id, mapExp repl, mapExp repl) g
where repl e@(CmmLit (CmmBlock bid)) =
case Map.lookup bid procLabels of
Just (_, Just info_lbl) -> CmmLit (CmmLabel info_lbl)
_ -> e
repl e = e
- -- The C back end expects to see return continuations before the call sites.
- -- Here, we sort them in reverse order -- it gets reversed later.
+
+ -- The C back end expects to see return continuations before the
+ -- call sites. Here, we sort them in reverse order -- it gets
+ -- reversed later.
let (_, block_order) = foldl add_block_num (0::Int, emptyBlockMap) (postorderDfs g)
add_block_num (i, map) block = (i+1, mapInsert (entryLabel block) i map)
sort_fn (bid, _) (bid', _) =
@@ -482,6 +314,27 @@ splitAtProcPoints entry_label callPPs procPoints procMap
procs
splitAtProcPoints _ _ _ _ t@(CmmData _ _) = return [t]
+
+-- Only called from CmmProcPoint.splitAtProcPoints. NB. does a
+-- recursive lookup, see comment below.
+replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph
+replaceBranches env cmmg
+ = {-# SCC "replaceBranches" #-}
+ ofBlockMap (g_entry cmmg) $ mapMap f $ toBlockMap cmmg
+ where
+ f block = replaceLastNode block $ last (lastNode block)
+
+ last :: CmmNode O C -> CmmNode O C
+ last (CmmBranch id) = CmmBranch (lookup id)
+ last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi)
+ last (CmmSwitch e tbl) = CmmSwitch e (map (fmap lookup) tbl)
+ last l@(CmmCall {}) = l
+ last l@(CmmForeignCall {}) = l
+ lookup id = fmap lookup (mapLookup id env) `orElse` id
+ -- XXX: this is a recursive lookup, it follows chains
+ -- until the lookup returns Nothing, at which point we
+ -- return the last BlockId
+
----------------------------------------------------------------
{-
diff --git a/compiler/cmm/CmmRewriteAssignments.hs b/compiler/cmm/CmmRewriteAssignments.hs
index ecf3f7e0c3..2f13997771 100644
--- a/compiler/cmm/CmmRewriteAssignments.hs
+++ b/compiler/cmm/CmmRewriteAssignments.hs
@@ -18,23 +18,23 @@ module CmmRewriteAssignments
import Cmm
import CmmUtils
import CmmOpt
-import OptimizationFuel
import StgCmmUtils
-import Control.Monad
+import UniqSupply
import Platform
import UniqFM
import Unique
import BlockId
-import Compiler.Hoopl hiding (Unique)
+import Hoopl
import Data.Maybe
+import Control.Monad
import Prelude hiding (succ, zip)
----------------------------------------------------------------
--- Main function
-rewriteAssignments :: Platform -> CmmGraph -> FuelUniqSM CmmGraph
+rewriteAssignments :: Platform -> CmmGraph -> UniqSM CmmGraph
rewriteAssignments platform g = do
-- Because we need to act on forwards and backwards information, we
-- first perform usage analysis and bake this information into the
@@ -213,7 +213,7 @@ usageTransfer = mkBTransfer3 first middle last
increaseUsage f r = addToUFM_C combine f r SingleUse
where combine _ _ = ManyUse
-usageRewrite :: BwdRewrite FuelUniqSM (WithRegUsage CmmNode) UsageMap
+usageRewrite :: BwdRewrite UniqSM (WithRegUsage CmmNode) UsageMap
usageRewrite = mkBRewrite3 first middle last
where first _ _ = return Nothing
middle :: Monad m => WithRegUsage CmmNode O O -> UsageMap -> m (Maybe (Graph (WithRegUsage CmmNode) O O))
@@ -226,7 +226,7 @@ usageRewrite = mkBRewrite3 first middle last
last _ _ = return Nothing
type CmmGraphWithRegUsage = GenCmmGraph (WithRegUsage CmmNode)
-annotateUsage :: CmmGraph -> FuelUniqSM (CmmGraphWithRegUsage)
+annotateUsage :: CmmGraph -> UniqSM (CmmGraphWithRegUsage)
annotateUsage vanilla_g =
let g = modifyGraph liftRegUsage vanilla_g
in liftM fst $ dataflowPassBwd g [(g_entry g, fact_bot usageLattice)] $
@@ -404,8 +404,8 @@ clobbers (CmmReg (CmmGlobal Hp), _) (_, _) = False
-- ToDo: Also catch MachOp case
clobbers (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _)
| getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?)
-clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
- where f (CmmLoad (CmmStackSlot (CallArea a') o') t)
+clobbers (CmmStackSlot a o, rhs) (_, expr) = f expr
+ where f (CmmLoad (CmmStackSlot a' o') t)
= (a, o, widthInBytes (cmmExprWidth rhs)) `overlaps` (a', o', widthInBytes (typeWidth t))
f (CmmLoad e _) = containsStackSlot e
f (CmmMachOp _ es) = or (map f es)
@@ -416,9 +416,6 @@ clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es)
containsStackSlot (CmmStackSlot{}) = True
containsStackSlot _ = False
-clobbers (CmmStackSlot (RegSlot l) _, _) (_, expr) = f expr
- where f (CmmLoad (CmmStackSlot (RegSlot l') _) _) = l == l'
- f _ = False
clobbers _ (_, e) = f e
where f (CmmLoad (CmmStackSlot _ _) _) = False
f (CmmLoad{}) = True -- conservative
@@ -432,7 +429,7 @@ clobbers _ (_, e) = f e
-- [ I32 ]
-- [ F64 ]
-- s' -w'- o'
-type CallSubArea = (AreaId, Int, Int) -- area, offset, width
+type CallSubArea = (Area, Int, Int) -- area, offset, width
overlaps :: CallSubArea -> CallSubArea -> Bool
overlaps (a, _, _) (a', _, _) | a /= a' = False
overlaps (_, o, w) (_, o', w') =
@@ -441,7 +438,7 @@ overlaps (_, o, w) (_, o', w') =
in (s' < o) && (s < o) -- Not LTE, because [ I32 ][ I32 ] is OK
lastAssignment :: WithRegUsage CmmNode O C -> AssignmentMap -> [(Label, AssignmentMap)]
-lastAssignment (Plain (CmmCall _ (Just k) _ _ _)) assign = [(k, invalidateVolatile k assign)]
+lastAssignment (Plain (CmmCall _ (Just k) _ _ _ _)) assign = [(k, invalidateVolatile k assign)]
lastAssignment (Plain (CmmForeignCall {succ=k})) assign = [(k, invalidateVolatile k assign)]
lastAssignment l assign = map (\id -> (id, deleteSinks l assign)) $ successors l
@@ -457,7 +454,7 @@ invalidateVolatile :: BlockId -> AssignmentMap -> AssignmentMap
invalidateVolatile k m = mapUFM p m
where p (AlwaysInline e) = if exp e then AlwaysInline e else NeverOptimize
where exp CmmLit{} = True
- exp (CmmLoad (CmmStackSlot (CallArea (Young k')) _) _)
+ exp (CmmLoad (CmmStackSlot (Young k') _) _)
| k' == k = False
exp (CmmLoad (CmmStackSlot _ _) _) = True
exp (CmmMachOp _ es) = and (map exp es)
@@ -527,7 +524,7 @@ assignmentTransfer = mkFTransfer3 (flip const) middleAssignment ((mkFactBase ass
-- values from the assignment map, due to reassignment of the local
-- register.) This is probably not locally sound.
-assignmentRewrite :: FwdRewrite FuelUniqSM (WithRegUsage CmmNode) AssignmentMap
+assignmentRewrite :: FwdRewrite UniqSM (WithRegUsage CmmNode) AssignmentMap
assignmentRewrite = mkFRewrite3 first middle last
where
first _ _ = return Nothing
@@ -596,10 +593,6 @@ assignmentRewrite = mkFRewrite3 first middle last
where rep = typeWidth (localRegType r)
_ -> old
-- See Note [Soundness of store rewriting]
- inlineExp assign old@(CmmLoad (CmmStackSlot (RegSlot r) _) _)
- = case lookupUFM assign r of
- Just (AlwaysInline x) -> x
- _ -> old
inlineExp _ old = old
inlinable :: CmmNode e x -> Bool
@@ -612,7 +605,7 @@ assignmentRewrite = mkFRewrite3 first middle last
-- in literals, which we can inline more aggressively, and inlining
-- gives us opportunities for more folding. However, we don't need any
-- facts to do MachOp folding.
-machOpFoldRewrite :: Platform -> FwdRewrite FuelUniqSM (WithRegUsage CmmNode) a
+machOpFoldRewrite :: Platform -> FwdRewrite UniqSM (WithRegUsage CmmNode) a
machOpFoldRewrite platform = mkFRewrite3 first middle last
where first _ _ = return Nothing
middle :: WithRegUsage CmmNode O O -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O O
diff --git a/compiler/cmm/CmmSpillReload.hs b/compiler/cmm/CmmSpillReload.hs
deleted file mode 100644
index 9e762fe48a..0000000000
--- a/compiler/cmm/CmmSpillReload.hs
+++ /dev/null
@@ -1,166 +0,0 @@
-{-# LANGUAGE GADTs, NoMonoLocalBinds, FlexibleContexts #-}
--- Norman likes local bindings
--- If this module lives on I'd like to get rid of this flag in due course
-
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
-
--- TODO: Get rid of this flag:
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-
-module CmmSpillReload
- ( dualLivenessWithInsertion
- )
-where
-
-import BlockId
-import Cmm
-import CmmUtils
-import CmmLive
-import OptimizationFuel
-
-import Control.Monad
-import Outputable hiding (empty)
-import qualified Outputable as PP
-import UniqSet
-
-import Compiler.Hoopl hiding (Unique)
-import Data.Maybe
-import Prelude hiding (succ, zip)
-
-{- Note [Overview of spill/reload]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The point of this module is to insert spills and reloads to establish
-the invariant that at a call or any proc point with an established
-protocol all live variables not expected in registers are sitting on the
-stack. We use a backward dual liveness analysis (both traditional
-register liveness as well as register slot liveness on the stack) to
-insert spills and reloads. It should be followed by a forward
-transformation to sink reloads as deeply as possible, so as to reduce
-register pressure: this transformation is performed by
-CmmRewriteAssignments.
-
-A variable can be expected to be live in a register, live on the
-stack, or both. This analysis ensures that spills and reloads are
-inserted as needed to make sure that every live variable needed
-after a call is available on the stack. Spills are placed immediately
-after their reaching definitions, but reloads are placed immediately
-after a return from a call (the entry point.)
-
-Note that we offer no guarantees about the consistency of the value
-in memory and the value in the register, except that they are
-equal across calls/procpoints. If the variable is changed, this
-mapping breaks: but as the original value of the register may still
-be useful in a different context, the memory location is not updated.
--}
-
-data DualLive = DualLive { on_stack :: RegSet, in_regs :: RegSet }
-
-changeStack, changeRegs :: (RegSet -> RegSet) -> DualLive -> DualLive
-changeStack f live = live { on_stack = f (on_stack live) }
-changeRegs f live = live { in_regs = f (in_regs live) }
-
-dualLiveLattice :: DataflowLattice DualLive
-dualLiveLattice = DataflowLattice "variables live in registers and on stack" empty add
- where empty = DualLive emptyRegSet emptyRegSet
- add _ (OldFact old) (NewFact new) = (changeIf $ change1 || change2, DualLive stack regs)
- where (change1, stack) = add1 (on_stack old) (on_stack new)
- (change2, regs) = add1 (in_regs old) (in_regs new)
- add1 old new = if sizeUniqSet join > sizeUniqSet old then (True, join) else (False, old)
- where join = unionUniqSets old new
-
-dualLivenessWithInsertion :: BlockSet -> CmmGraph -> FuelUniqSM CmmGraph
-dualLivenessWithInsertion procPoints g =
- liftM fst $ dataflowPassBwd g [] $ analRewBwd dualLiveLattice
- (dualLiveTransfers (g_entry g) procPoints)
- (insertSpillsAndReloads g procPoints)
-
--- Note [Live registers on entry to procpoints]
--- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- Remember that the transfer function is only ever run on the rewritten
--- version of a graph, and the rewrite function for spills and reloads
--- enforces the invariant that no local registers are live on entry to
--- a procpoint. Accordingly, we check for this invariant here. An old
--- version of this code incorrectly claimed that any live registers were
--- live on the stack before entering the function: this is wrong, but
--- didn't cause bugs because it never actually was invoked.
-
-dualLiveTransfers :: BlockId -> BlockSet -> (BwdTransfer CmmNode DualLive)
-dualLiveTransfers entry procPoints = mkBTransfer3 first middle last
- where first :: CmmNode C O -> DualLive -> DualLive
- first (CmmEntry id) live -- See Note [Live registers on entry to procpoints]
- | id == entry || setMember id procPoints = noLiveOnEntry id (in_regs live) live
- | otherwise = live
-
- middle :: CmmNode O O -> DualLive -> DualLive
- middle m = changeStack updSlots
- . changeRegs updRegs
- where -- Reuse middle of liveness analysis from CmmLive
- updRegs = case getBTransfer3 xferLive of (_, middle, _) -> middle m
-
- updSlots live = foldSlotsUsed reload (foldSlotsDefd spill live m) m
- spill live s@(RegSlot r, _, _) = check s $ deleteFromRegSet live r
- spill live _ = live
- reload live s@(RegSlot r, _, _) = check s $ extendRegSet live r
- reload live _ = live
- -- Ensure the assignment refers to the entirety of the
- -- register slot (and not just a slice).
- check (RegSlot (LocalReg _ ty), o, w) x
- | o == w && w == widthInBytes (typeWidth ty) = x
- check _ _ = panic "dualLiveTransfers: slices unsupported"
-
- -- Register analysis is identical to liveness analysis from CmmLive.
- last :: CmmNode O C -> FactBase DualLive -> DualLive
- last l fb = changeRegs (gen_kill l) $ case l of
- CmmCall {cml_cont=Nothing} -> empty
- CmmCall {cml_cont=Just k} -> keep_stack_only k
- CmmForeignCall {succ=k} -> keep_stack_only k
- _ -> joinOutFacts dualLiveLattice l fb
- where empty = fact_bot dualLiveLattice
- lkp k = fromMaybe empty (lookupFact k fb)
- keep_stack_only k = DualLive (on_stack (lkp k)) emptyRegSet
-
-insertSpillsAndReloads :: CmmGraph -> BlockSet -> CmmBwdRewrite DualLive
-insertSpillsAndReloads graph procPoints = deepBwdRw3 first middle nothing
- -- Beware: deepBwdRw with one polymorphic function seems more reasonable here,
- -- but GHC miscompiles it, see bug #4044.
- where first :: CmmNode C O -> Fact O DualLive -> CmmReplGraph C O
- first e@(CmmEntry id) live = return $
- if id /= (g_entry graph) && setMember id procPoints then
- case map reload (uniqSetToList (in_regs live)) of
- [] -> Nothing
- is -> Just $ mkFirst e <*> mkMiddles is
- else Nothing
- -- EZY: There was some dead code for handling the case where
- -- we were not splitting procedures. Check Git history if
- -- you're interested (circa e26ea0f41).
-
- middle :: CmmNode O O -> Fact O DualLive -> CmmReplGraph O O
- -- Don't add spills next to reloads.
- middle (CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg') _) _)) _ | reg == reg' = return Nothing
- -- Spill if register is live on stack.
- middle m@(CmmAssign (CmmLocal reg) _) live
- | reg `elemRegSet` on_stack live = return (Just (mkMiddles [m, spill reg]))
- middle _ _ = return Nothing
-
- nothing _ _ = return Nothing
-
-spill, reload :: LocalReg -> CmmNode O O
-spill r = CmmStore (regSlot r) (CmmReg $ CmmLocal r)
-reload r = CmmAssign (CmmLocal r) (CmmLoad (regSlot r) $ localRegType r)
-
----------------------
--- prettyprinting
-
-ppr_regs :: String -> RegSet -> SDoc
-ppr_regs s regs = text s <+> commafy (map ppr $ uniqSetToList regs)
- where commafy xs = hsep $ punctuate comma xs
-
-instance Outputable DualLive where
- ppr (DualLive {in_regs = regs, on_stack = stack}) =
- if isEmptyUniqSet regs && isEmptyUniqSet stack then
- text "<nothing-live>"
- else
- nest 2 $ fsep [if isEmptyUniqSet regs then PP.empty
- else (ppr_regs "live in regs =" regs),
- if isEmptyUniqSet stack then PP.empty
- else (ppr_regs "live on stack =" stack)]
diff --git a/compiler/cmm/CmmStackLayout.hs b/compiler/cmm/CmmStackLayout.hs
deleted file mode 100644
index c7fedad05b..0000000000
--- a/compiler/cmm/CmmStackLayout.hs
+++ /dev/null
@@ -1,591 +0,0 @@
-{-# LANGUAGE GADTs, NoMonoLocalBinds #-}
--- Norman likes local bindings
--- If this module lives on I'd like to get rid of the NoMonoLocalBinds
--- extension in due course
-
-{-# OPTIONS -fno-warn-tabs #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and
--- detab the module (please do the detabbing in a separate patch). See
--- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
--- for details
-
--- Todo: remove -fno-warn-warnings-deprecations
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
-
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-#if __GLASGOW_HASKELL__ >= 703
--- GHC 7.0.1 improved incomplete pattern warnings with GADTs
-{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
-#endif
-
-module CmmStackLayout
- ( SlotEnv, liveSlotAnal, liveSlotTransfers, removeLiveSlotDefs
- , getSpEntryMap, layout, manifestSP, igraph, areaBuilder
- , stubSlotsOnDeath ) -- to help crash early during debugging
-where
-
-import Constants
-import Prelude hiding (succ, zip, unzip, last)
-
-import BlockId
-import Cmm
-import CmmUtils
-import CmmProcPoint
-import Maybes
-import MkGraph (stackStubExpr)
-import Control.Monad
-import OptimizationFuel
-import Outputable
-import SMRep (ByteOff)
-
-import Compiler.Hoopl
-
-import Data.Map (Map)
-import qualified Data.Map as Map
-import qualified FiniteMap as Map
-
-------------------------------------------------------------------------
--- Stack Layout --
-------------------------------------------------------------------------
-
--- | Before we lay out the stack, we need to know something about the
--- liveness of the stack slots. In particular, to decide whether we can
--- reuse a stack location to hold multiple stack slots, we need to know
--- when each of the stack slots is used.
--- Although tempted to use something simpler, we really need a full interference
--- graph. Consider the following case:
--- case <...> of
--- 1 -> <spill x>; // y is dead out
--- 2 -> <spill y>; // x is dead out
--- 3 -> <spill x and y>
--- If we consider the arms in order and we use just the deadness information given by a
--- dataflow analysis, we might decide to allocate the stack slots for x and y
--- to the same stack location, which will lead to incorrect code in the third arm.
--- We won't make this mistake with an interference graph.
-
--- First, the liveness analysis.
--- We represent a slot with an area, an offset into the area, and a width.
--- Tracking the live slots is a bit tricky because there may be loads and stores
--- into only a part of a stack slot (e.g. loading the low word of a 2-word long),
--- e.g. Slot A 0 8 overlaps with Slot A 4 4.
---
--- The definition of a slot set is intended to reduce the number of overlap
--- checks we have to make. There's no reason to check for overlap between
--- slots in different areas, so we segregate the map by Area's.
--- We expect few slots in each Area, so we collect them in an unordered list.
--- To keep these lists short, any contiguous live slots are coalesced into
--- a single slot, on insertion.
-
-slotLattice :: DataflowLattice SubAreaSet
-slotLattice = DataflowLattice "live slots" Map.empty add
- where add _ (OldFact old) (NewFact new) = case Map.foldRightWithKey addArea (False, old) new of
- (change, x) -> (changeIf change, x)
- addArea a newSlots z = foldr (addSlot a) z newSlots
- addSlot a slot (changed, map) =
- let (c, live) = liveGen slot $ Map.findWithDefault [] a map
- in (c || changed, Map.insert a live map)
-
-slotLatticeJoin :: [SubAreaSet] -> SubAreaSet
-slotLatticeJoin facts = foldr extend (fact_bot slotLattice) facts
- where extend fact res = snd $ fact_join slotLattice undefined (OldFact fact) (NewFact res)
-
-type SlotEnv = BlockEnv SubAreaSet
- -- The sub-areas live on entry to the block
-
-liveSlotAnal :: CmmGraph -> FuelUniqSM SlotEnv
-liveSlotAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd slotLattice liveSlotTransfers
-
--- Add the subarea s to the subareas in the list-set (possibly coalescing it with
--- adjacent subareas), and also return whether s was a new addition.
-liveGen :: SubArea -> [SubArea] -> (Bool, [SubArea])
-liveGen s set = liveGen' s set []
- where liveGen' s [] z = (True, s : z)
- liveGen' s@(a, hi, w) (s'@(a', hi', w') : rst) z =
- if a /= a' || hi < lo' || lo > hi' then -- no overlap
- liveGen' s rst (s' : z)
- else if s' `contains` s then -- old contains new
- (False, set)
- else -- overlap: coalesce the slots
- let new_hi = max hi hi'
- new_lo = min lo lo'
- in liveGen' (a, new_hi, new_hi - new_lo) rst z
- where lo = hi - w -- remember: areas grow down
- lo' = hi' - w'
- contains (a, hi, w) (a', hi', w') =
- a == a' && hi >= hi' && hi - w <= hi' - w'
-
-liveKill :: SubArea -> [SubArea] -> [SubArea]
-liveKill (a, hi, w) set = -- pprTrace "killing slots in area" (ppr a) $
- liveKill' set []
- where liveKill' [] z = z
- liveKill' (s'@(a', hi', w') : rst) z =
- if a /= a' || hi < lo' || lo > hi' then -- no overlap
- liveKill' rst (s' : z)
- else -- overlap: split the old slot
- let z' = if hi' > hi then (a, hi', hi' - hi) : z else z
- z'' = if lo > lo' then (a, lo, lo - lo') : z' else z'
- in liveKill' rst z''
- where lo = hi - w -- remember: areas grow down
- lo' = hi' - w'
-
--- Note: the stack slots that hold variables returned on the stack are not
--- considered live in to the block -- we treat the first node as a definition site.
--- BEWARE?: Am I being a little careless here in failing to check for the
--- entry Id (which would use the CallArea Old).
-liveSlotTransfers :: BwdTransfer CmmNode SubAreaSet
-liveSlotTransfers = mkBTransfer3 frt mid lst
- where frt :: CmmNode C O -> SubAreaSet -> SubAreaSet
- frt (CmmEntry l) f = Map.delete (CallArea (Young l)) f
-
- mid :: CmmNode O O -> SubAreaSet -> SubAreaSet
- mid n f = foldSlotsUsed addSlot (removeLiveSlotDefs f n) n
- lst :: CmmNode O C -> FactBase SubAreaSet -> SubAreaSet
- lst n f = liveInSlots n $ case n of
- CmmCall {cml_cont=Nothing, cml_args=args} -> add_area (CallArea Old) args out
- CmmCall {cml_cont=Just k, cml_args=args} -> add_area (CallArea Old) args (add_area (CallArea (Young k)) args out)
- CmmForeignCall {succ=k, updfr=oldend} -> add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
- _ -> out
- where out = joinOutFacts slotLattice n f
- add_area _ n live | n == 0 = live
- add_area a n live = Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
-
--- Slot sets: adding slots, removing slots, and checking for membership.
-liftToArea :: Area -> ([SubArea] -> [SubArea]) -> SubAreaSet -> SubAreaSet
-addSlot, removeSlot :: SubAreaSet -> SubArea -> SubAreaSet
-elemSlot :: SubAreaSet -> SubArea -> Bool
-liftToArea a f map = Map.insert a (f (Map.findWithDefault [] a map)) map
-addSlot live (a, i, w) = liftToArea a (snd . liveGen (a, i, w)) live
-removeSlot live (a, i, w) = liftToArea a (liveKill (a, i, w)) live
-elemSlot live (a, i, w) =
- not $ fst $ liveGen (a, i, w) (Map.findWithDefault [] a live)
-
-removeLiveSlotDefs :: (DefinerOfSlots s, UserOfSlots s) => SubAreaSet -> s -> SubAreaSet
-removeLiveSlotDefs = foldSlotsDefd removeSlot
-
-liveInSlots :: (DefinerOfSlots s, UserOfSlots s) => s -> SubAreaSet -> SubAreaSet
-liveInSlots x live = foldSlotsUsed addSlot (removeLiveSlotDefs live x) x
-
-liveLastIn :: CmmNode O C -> (BlockId -> SubAreaSet) -> SubAreaSet
-liveLastIn l env = liveInSlots l (liveLastOut env l)
-
--- Don't forget to keep the outgoing parameters in the CallArea live,
--- as well as the update frame.
--- Note: We have to keep the update frame live at a call because of the
--- case where the function doesn't return -- in that case, there won't
--- be a return to keep the update frame live. We'd still better keep the
--- info pointer in the update frame live at any call site;
--- otherwise we could screw up the garbage collector.
-liveLastOut :: (BlockId -> SubAreaSet) -> CmmNode O C -> SubAreaSet
-liveLastOut env l =
- case l of
- CmmCall _ Nothing n _ _ ->
- add_area (CallArea Old) n out -- add outgoing args (includes upd frame)
- CmmCall _ (Just k) n _ _ ->
- add_area (CallArea Old) n (add_area (CallArea (Young k)) n out)
- CmmForeignCall { succ = k, updfr = oldend } ->
- add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
- _ -> out
- where out = slotLatticeJoin $ map env $ successors l
- add_area _ n live | n == 0 = live
- add_area a n live =
- Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
-
--- The liveness analysis must be precise: otherwise, we won't know if a definition
--- should really kill a live-out stack slot.
--- But the interference graph does not have to be precise -- it might decide that
--- any live areas interfere. To maintain both a precise analysis and an imprecise
--- interference graph, we need to convert the live-out stack slots to graph nodes
--- at each and every instruction; rather than reconstruct a new list of nodes
--- every time, I provide a function to fold over the nodes, which should be a
--- reasonably efficient approach for the implementations we envision.
--- Of course, it will probably be much easier to program if we just return a list...
-type Set x = Map x ()
-data IGraphBuilder n =
- Builder { foldNodes :: forall z. SubArea -> (n -> z -> z) -> z -> z
- , _wordsOccupied :: AreaSizeMap -> AreaMap -> n -> [Int]
- }
-
-areaBuilder :: IGraphBuilder Area
-areaBuilder = Builder fold words
- where fold (a, _, _) f z = f a z
- words areaSize areaMap a =
- case Map.lookup a areaMap of
- Just addr -> [addr .. addr + (Map.lookup a areaSize `orElse`
- pprPanic "wordsOccupied: unknown area" (ppr areaSize <+> ppr a))]
- Nothing -> []
-
---slotBuilder :: IGraphBuilder (Area, Int)
---slotBuilder = undefined
-
--- Now, we can build the interference graph.
--- The usual story: a definition interferes with all live outs and all other
--- definitions.
-type IGraph x = Map x (Set x)
-type IGPair x = (IGraph x, IGraphBuilder x)
-igraph :: (Ord x) => IGraphBuilder x -> SlotEnv -> CmmGraph -> IGraph x
-igraph builder env g = foldr interfere Map.empty (postorderDfs g)
- where foldN = foldNodes builder
- interfere block igraph = foldBlockNodesB3 (first, middle, last) block igraph
- where first _ (igraph, _) = igraph
- middle node (igraph, liveOut) =
- (addEdges igraph node liveOut, liveInSlots node liveOut)
- last node igraph =
- (addEdges igraph node $ liveLastOut env' node, liveLastIn node env')
-
- -- add edges between a def and the other defs and liveouts
- addEdges igraph i out = fst $ foldSlotsDefd addDef (igraph, out) i
- addDef (igraph, out) def@(a, _, _) =
- (foldN def (addDefN out) igraph,
- Map.insert a (snd $ liveGen def (Map.findWithDefault [] a out)) out)
- addDefN out n igraph =
- let addEdgeNO o igraph = foldN o addEdgeNN igraph
- addEdgeNN n' igraph = addEdgeNN' n n' $ addEdgeNN' n' n igraph
- addEdgeNN' n n' igraph = Map.insert n (Map.insert n' () set) igraph
- where set = Map.findWithDefault Map.empty n igraph
- in Map.foldRightWithKey (\ _ os igraph -> foldr addEdgeNO igraph os) igraph out
- env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
-
--- Before allocating stack slots, we need to collect one more piece of information:
--- what's the highest offset (in bytes) used in each Area?
--- We'll need to allocate that much space for each Area.
-
--- Mapping of areas to area sizes (not offsets!)
-type AreaSizeMap = AreaMap
-
--- JD: WHY CAN'T THIS COME FROM THE slot-liveness info?
-getAreaSize :: ByteOff -> CmmGraph -> AreaSizeMap
- -- The domain of the returned mapping consists only of Areas
- -- used for (a) variable spill slots, and (b) parameter passing areas for calls
-getAreaSize entry_off g =
- foldGraphBlocks (foldBlockNodesF3 (first, add_regslots, last))
- (Map.singleton (CallArea Old) entry_off) g
- where first _ z = z
- last :: CmmNode O C -> Map Area Int -> Map Area Int
- last l@(CmmCall _ Nothing args res _) z = add_regslots l (add (add z area args) area res)
- where area = CallArea Old
- last l@(CmmCall _ (Just k) args res _) z = add_regslots l (add (add z area args) area res)
- where area = CallArea (Young k)
- last l@(CmmForeignCall {succ = k}) z = add_regslots l (add z area wORD_SIZE)
- where area = CallArea (Young k)
- last l z = add_regslots l z
- add_regslots i z = foldSlotsUsed addSlot (foldSlotsDefd addSlot z i) i
- addSlot z (a@(RegSlot (LocalReg _ ty)), _, _) =
- add z a $ widthInBytes $ typeWidth ty
- addSlot z _ = z
- add z a off = Map.insert a (max off (Map.findWithDefault 0 a z)) z
- -- The 'max' is important. Two calls, to f and g, might share a common
- -- continuation (and hence a common CallArea), but their number of overflow
- -- parameters might differ.
- -- EZY: Ought to use insert with combining function...
-
-
--- Find the Stack slots occupied by the subarea's conflicts
-conflictSlots :: Ord x => IGPair x -> AreaSizeMap -> AreaMap -> SubArea -> Set Int
-conflictSlots (ig, Builder foldNodes wordsOccupied) areaSize areaMap subarea =
- foldNodes subarea foldNode Map.empty
- where foldNode n set = Map.foldRightWithKey conflict set $ Map.findWithDefault Map.empty n ig
- conflict n' () set = liveInSlots areaMap n' set
- -- Add stack slots occupied by igraph node n
- liveInSlots areaMap n set = foldr setAdd set (wordsOccupied areaSize areaMap n)
- setAdd w s = Map.insert w () s
-
--- Find any open space for 'area' on the stack, starting from the
--- 'offset'. If the area is a CallArea or a spill slot for a pointer,
--- then it must be word-aligned.
-freeSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> Int
-freeSlotFrom ig areaSize offset areaMap area =
- let size = Map.lookup area areaSize `orElse` 0
- conflicts = conflictSlots ig areaSize areaMap (area, size, size)
- -- CallAreas and Ptrs need to be word-aligned (round up!)
- align = case area of CallArea _ -> align'
- RegSlot r | isGcPtrType (localRegType r) -> align'
- RegSlot _ -> id
- align' n = (n + (wORD_SIZE - 1)) `div` wORD_SIZE * wORD_SIZE
- -- Find a space big enough to hold the area
- findSpace curr 0 = curr
- findSpace curr cnt = -- part of target slot, # of bytes left to check
- if Map.member curr conflicts then
- findSpace (align (curr + size)) size -- try the next (possibly) open space
- else findSpace (curr - 1) (cnt - 1)
- in findSpace (align (offset + size)) size
-
--- Find an open space on the stack, and assign it to the area.
-allocSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> AreaMap
-allocSlotFrom ig areaSize from areaMap area =
- if Map.member area areaMap then areaMap
- else Map.insert area (freeSlotFrom ig areaSize from areaMap area) areaMap
-
--- Figure out all of the offsets from the slot location; this will be
--- non-zero for procpoints.
-type SpEntryMap = BlockEnv Int
-getSpEntryMap :: Int -> CmmGraph -> SpEntryMap
-getSpEntryMap entry_off g@(CmmGraph {g_entry = entry})
- = foldGraphBlocks add_sp_off (mapInsert entry entry_off emptyBlockMap) g
- where add_sp_off :: CmmBlock -> BlockEnv Int -> BlockEnv Int
- add_sp_off b env =
- case lastNode b of
- CmmCall {cml_cont=Just succ, cml_ret_args=off} -> mapInsert succ off env
- CmmForeignCall {succ=succ} -> mapInsert succ wORD_SIZE env
- _ -> env
-
--- | Greedy stack layout.
--- Compute liveness, build the interference graph, and allocate slots for the areas.
--- We visit each basic block in a (generally) forward order.
-
--- At each instruction that names a register subarea r, we immediately allocate
--- any available slot on the stack by the following procedure:
--- 1. Find the sub-areas S that conflict with r
--- 2. Find the stack slots used for S
--- 3. Choose a contiguous stack space s not in S (s must be large enough to hold r)
-
--- For a CallArea, we allocate the stack space only when we reach a function
--- call that returns to the CallArea's blockId.
--- Then, we allocate the Area subject to the following constraints:
--- a) It must be younger than all the sub-areas that are live on entry to the block
--- This constraint is only necessary for the successor of a call
--- b) It must not overlap with any already-allocated Area with which it conflicts
--- (ie at some point, not necessarily now, is live at the same time)
--- Part (b) is just the 1,2,3 part above
-
--- Note: The stack pointer only has to be younger than the youngest live stack slot
--- at proc points. Otherwise, the stack pointer can point anywhere.
-
-layout :: ProcPointSet -> SpEntryMap -> SlotEnv -> ByteOff -> CmmGraph -> AreaMap
--- The domain of the returned map includes an Area for EVERY block
--- including each block that is not the successor of a call (ie is not a proc-point)
--- That's how we return the info of what the SP should be at the entry of every non
--- procpoint block. However, note that procpoint blocks have their
--- /slot/ stored, which is not necessarily the value of the SP on entry
--- to the block (in fact, it probably isn't, due to argument passing).
--- See [Procpoint Sp offset]
-
-layout procPoints spEntryMap env entry_off g =
- let ig = (igraph areaBuilder env g, areaBuilder)
- env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
- areaSize = getAreaSize entry_off g
-
- -- Find the youngest live stack slot that has already been allocated
- youngest_live :: AreaMap -- Already allocated
- -> SubAreaSet -- Sub-areas live here
- -> ByteOff -- Offset of the youngest byte of any
- -- already-allocated, live sub-area
- youngest_live areaMap live = fold_subareas young_slot live 0
- where young_slot (a, o, _) z = case Map.lookup a areaMap of
- Just top -> max z $ top + o
- Nothing -> z
- fold_subareas f m z = Map.foldRightWithKey (\_ s z -> foldr f z s) z m
-
- -- Allocate space for spill slots and call areas
- allocVarSlot = allocSlotFrom ig areaSize 0
-
- -- Update the successor's incoming SP.
- setSuccSPs inSp bid areaMap =
- case (Map.lookup area areaMap , mapLookup bid (toBlockMap g)) of
- (Just _, _) -> areaMap -- succ already knows incoming SP
- (Nothing, Just _) ->
- if setMember bid procPoints then
- let young = youngest_live areaMap $ env' bid
- -- start = case returnOff stackInfo of Just b -> max b young
- -- Nothing -> young
- start = young -- maybe wrong, but I don't understand
- -- why the preceding is necessary...
- in allocSlotFrom ig areaSize start areaMap area
- else Map.insert area inSp areaMap
- (_, Nothing) -> panic "Block not found in cfg"
- where area = CallArea (Young bid)
-
- layoutAreas areaMap block = foldBlockNodesF3 (flip const, allocMid, allocLast (entryLabel block)) block areaMap
- allocMid m areaMap = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap m) m
- allocLast bid l areaMap =
- foldr (setSuccSPs inSp) areaMap' (successors l)
- where inSp = slot + spOffset -- [Procpoint Sp offset]
- -- If it's not in the map, we should use our previous
- -- calculation unchanged.
- spOffset = mapLookup bid spEntryMap `orElse` 0
- slot = expectJust "slot in" $ Map.lookup (CallArea (Young bid)) areaMap
- areaMap' = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap l) l
- alloc' areaMap (a@(RegSlot _), _, _) = allocVarSlot areaMap a
- alloc' areaMap _ = areaMap
-
- initMap = Map.insert (CallArea (Young (g_entry g))) 0
- . Map.insert (CallArea Old) 0
- $ Map.empty
-
- areaMap = foldl layoutAreas initMap (postorderDfs g)
- in -- pprTrace "ProcPoints" (ppr procPoints) $
- -- pprTrace "Area SizeMap" (ppr areaSize) $
- -- pprTrace "Entry offset" (ppr entry_off) $
- -- pprTrace "Area Map" (ppr areaMap) $
- areaMap
-
-{- Note [Procpoint Sp offset]
-
-The calculation of inSp is a little tricky. (Un)fortunately, if you get
-it wrong, you will get inefficient but correct code. You know you've
-got it wrong if the generated stack pointer bounces up and down for no
-good reason.
-
-Why can't we just set inSp to the location of the slot? (This is what
-the code used to do.) The trouble is when we actually hit the proc
-point the start of the slot will not be the same as the actual Sp due
-to argument passing:
-
- a:
- I32[(young<b> + 4)] = cde;
- // Stack pointer is moved to young end (bottom) of young<b> for call
- // +-------+
- // | arg 1 |
- // +-------+ <- Sp
- call (I32[foobar::I32])(...) returns to Just b (4) (4) with update frame 4;
- b:
- // After call, stack pointer is above the old end (top) of
- // young<b> (the difference is spOffset)
- // +-------+ <- Sp
- // | arg 1 |
- // +-------+
-
-If we blithely set the Sp to be the same as the slot (the young end of
-young<b>), an adjustment will be necessary when we go to the next block.
-This is wasteful. So, instead, for the next block after a procpoint,
-the actual Sp should be set to the same as the true Sp when we just
-entered the procpoint. Then manifestSP will automatically do the right
-thing.
-
-Questions you may ask:
-
-1. Why don't we need to change the mapping for the procpoint itself?
- Because manifestSP does its own calculation of the true stack value,
- manifestSP will notice the discrepancy between the actual stack
- pointer and the slot start, and adjust all of its memory accesses
- accordingly. So the only problem is when we adjust the Sp in
- preparation for the successor block; that's why this code is here and
- not in setSuccSPs.
-
-2. Why don't we make the procpoint call area and the true offset match
- up? If we did that, we would never use memory above the true value
- of the stack pointer, thus wasting all of the stack we used to store
- arguments. You might think that some clever changes to the slot
- offsets, using negative offsets, might fix it, but this does not make
- semantic sense.
-
-3. If manifestSP is already calculating the true stack value, why we can't
- do this trick inside manifestSP itself? The reason is that if two
- branches join with inconsistent SPs, one of them has to be fixed: we
- can't know what the fix should be without already knowing what the
- chosen location of SP is on the next successor. (This is
- the "succ already knows incoming SP" case), This calculation cannot
- be easily done in manifestSP, since it processes the nodes
- /backwards/. So we need to have figured this out before we hit
- manifestSP.
--}
-
--- After determining the stack layout, we can:
--- 1. Replace references to stack Areas with addresses relative to the stack
--- pointer.
--- 2. Insert adjustments to the stack pointer to ensure that it is at a
--- conventional location at each proc point.
--- Because we don't take interrupts on the execution stack, we only need the
--- stack pointer to be younger than the live values on the stack at proc points.
--- 3. Compute the maximum stack offset used in the procedure and replace
--- the stack high-water mark with that offset.
-manifestSP :: SpEntryMap -> AreaMap -> ByteOff -> CmmGraph -> FuelUniqSM CmmGraph
-manifestSP spEntryMap areaMap entry_off g@(CmmGraph {g_entry=entry}) =
- ofBlockMap entry `liftM` foldl replB (return mapEmpty) (postorderDfs g)
- where slot a = -- pprTrace "slot" (ppr a) $
- Map.lookup a areaMap `orElse` panic "unallocated Area"
- slot' (Just id) = slot $ CallArea (Young id)
- slot' Nothing = slot $ CallArea Old
- sp_high = maxSlot slot g
- proc_entry_sp = slot (CallArea Old) + entry_off
-
- spOffset id = mapLookup id spEntryMap `orElse` 0
-
- sp_on_entry id | id == entry = proc_entry_sp
- sp_on_entry id = slot' (Just id) + spOffset id
-
- -- On entry to procpoints, the stack pointer is conventional;
- -- otherwise, we check the SP set by predecessors.
- replB :: FuelUniqSM (BlockEnv CmmBlock) -> CmmBlock -> FuelUniqSM (BlockEnv CmmBlock)
- replB blocks block =
- do let (head, middles, JustC tail :: MaybeC C (CmmNode O C)) = blockToNodeList block
- middles' = map (middle spIn) middles
- bs <- replLast head middles' tail
- flip (foldr insertBlock) bs `liftM` blocks
- where spIn = sp_on_entry (entryLabel block)
-
- middle spOff m = mapExpDeep (replSlot spOff) m
- -- XXX there shouldn't be any global registers in the
- -- CmmCall, so there shouldn't be any slots in
- -- CmmCall... check that...
- last spOff l = mapExpDeep (replSlot spOff) l
- replSlot spOff (CmmStackSlot a i) = CmmRegOff (CmmGlobal Sp) (spOff - (slot a + i))
- replSlot _ (CmmLit CmmHighStackMark) = -- replacing the high water mark
- CmmLit (CmmInt (toInteger (max 0 (sp_high - proc_entry_sp))) (typeWidth bWord))
- -- Invariant: Sp is always greater than SpLim. Thus, if
- -- the high water mark is zero, we can optimize away the
- -- conditional branch. Relies on dead code elimination
- -- to get rid of the dead GC blocks.
- -- EZY: Maybe turn this into a guard that checks if a
- -- statement is stack-check ish? Maybe we should make
- -- an actual mach-op for it, so there's no chance of
- -- mixing this up with something else...
- replSlot _ (CmmMachOp (MO_U_Lt _)
- [CmmMachOp (MO_Sub _)
- [ CmmReg (CmmGlobal Sp)
- , CmmLit (CmmInt 0 _)],
- CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
- replSlot _ e = e
-
- replLast :: MaybeC C (CmmNode C O) -> [CmmNode O O] -> CmmNode O C -> FuelUniqSM [CmmBlock]
- replLast h m l@(CmmCall _ k n _ _) = updSp (slot' k + n) h m l
- -- JD: LastForeignCall probably ought to have an outgoing
- -- arg size, just like LastCall
- replLast h m l@(CmmForeignCall {succ=k}) = updSp (slot' (Just k) + wORD_SIZE) h m l
- replLast h m l@(CmmBranch k) = updSp (sp_on_entry k) h m l
- replLast h m l = uncurry (:) `liftM` foldr succ (return (b, [])) (successors l)
- where b :: CmmBlock
- b = updSp' spIn h m l
- succ succId z =
- let succSp = sp_on_entry succId in
- if succSp /= spIn then
- do (b, bs) <- z
- (b', bs') <- insertBetween b (adjustSp succSp) succId
- return (b', bs' ++ bs)
- else z
-
- updSp sp h m l = return [updSp' sp h m l]
- updSp' sp h m l | sp == spIn = blockOfNodeList (h, m, JustC $ last sp l)
- | otherwise = blockOfNodeList (h, m ++ adjustSp sp, JustC $ last sp l)
- adjustSp sp = [CmmAssign (CmmGlobal Sp) e]
- where e = CmmMachOp (MO_Add wordWidth) [CmmReg (CmmGlobal Sp), off]
- off = CmmLit $ CmmInt (toInteger $ spIn - sp) wordWidth
-
-
--- To compute the stack high-water mark, we fold over the graph and
--- compute the highest slot offset.
-maxSlot :: (Area -> Int) -> CmmGraph -> Int
-maxSlot slotOff g = foldGraphBlocks (foldBlockNodesF3 (flip const, highSlot, highSlot)) 0 g
- where highSlot i z = foldSlotsUsed add (foldSlotsDefd add z i) i
- add z (a, i, _) = max z (slotOff a + i)
-
------------------------------------------------------------------------------
--- | Sanity check: stub pointers immediately after they die
------------------------------------------------------------------------------
--- This will miss stack slots that are last used in a Last node,
--- but it should do pretty well...
-
-stubSlotsOnDeath :: CmmGraph -> FuelUniqSM CmmGraph
-stubSlotsOnDeath g = liftM fst $ dataflowPassBwd g [] $ analRewBwd slotLattice
- liveSlotTransfers
- rewrites
- where rewrites = mkBRewrite3 frt mid lst
- frt _ _ = return Nothing
- mid m liveSlots = return $ foldSlotsUsed (stub liveSlots m) Nothing m
- lst _ _ = return Nothing
- stub liveSlots m rst subarea@(a, off, w) =
- if elemSlot liveSlots subarea then rst
- else let store = mkMiddle $ CmmStore (CmmStackSlot a off)
- (stackStubExpr (widthFromBytes w))
- in case rst of Nothing -> Just (mkMiddle m <*> store)
- Just g -> Just (g <*> store)
diff --git a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs
index 0756c87583..f2e4d8e183 100644
--- a/compiler/cmm/CmmUtils.hs
+++ b/compiler/cmm/CmmUtils.hs
@@ -60,13 +60,14 @@ module CmmUtils(
-- * Operations that probably don't belong here
modifyGraph,
- lastNode, replaceLastNode, insertBetween,
+ lastNode, replaceLastNode,
ofBlockMap, toBlockMap, insertBlock,
ofBlockList, toBlockList, bodyToBlockList,
foldGraphBlocks, mapGraphNodes, postorderDfs, mapGraphNodes1,
analFwd, analBwd, analRewFwd, analRewBwd,
- dataflowPassFwd, dataflowPassBwd
+ dataflowPassFwd, dataflowPassBwd, dataflowAnalFwd, dataflowAnalBwd,
+ dataflowAnalFwdBlocks
) where
#include "HsVersions.h"
@@ -79,7 +80,6 @@ import Cmm
import BlockId
import CLabel
import Outputable
-import OptimizationFuel as F
import Unique
import UniqSupply
import Constants( wORD_SIZE, tAG_MASK )
@@ -88,8 +88,7 @@ import Util
import Data.Word
import Data.Maybe
import Data.Bits
-import Control.Monad
-import Compiler.Hoopl hiding ( Unique )
+import Hoopl
---------------------------------------------------
--
@@ -402,13 +401,13 @@ mkLiveness (reg:regs)
modifyGraph :: (Graph n C C -> Graph n' C C) -> GenCmmGraph n -> GenCmmGraph n'
modifyGraph f g = CmmGraph {g_entry=g_entry g, g_graph=f (g_graph g)}
-toBlockMap :: CmmGraph -> LabelMap CmmBlock
+toBlockMap :: CmmGraph -> BlockEnv CmmBlock
toBlockMap (CmmGraph {g_graph=GMany NothingO body NothingO}) = body
-ofBlockMap :: BlockId -> LabelMap CmmBlock -> CmmGraph
+ofBlockMap :: BlockId -> BlockEnv CmmBlock -> CmmGraph
ofBlockMap entry bodyMap = CmmGraph {g_entry=entry, g_graph=GMany NothingO bodyMap NothingO}
-insertBlock :: CmmBlock -> LabelMap CmmBlock -> LabelMap CmmBlock
+insertBlock :: CmmBlock -> BlockEnv CmmBlock -> BlockEnv CmmBlock
insertBlock block map =
ASSERT (isNothing $ mapLookup id map)
mapInsert id block map
@@ -418,7 +417,8 @@ toBlockList :: CmmGraph -> [CmmBlock]
toBlockList g = mapElems $ toBlockMap g
ofBlockList :: BlockId -> [CmmBlock] -> CmmGraph
-ofBlockList entry blocks = CmmGraph {g_entry=entry, g_graph=GMany NothingO body NothingO}
+ofBlockList entry blocks = CmmGraph { g_entry = entry
+ , g_graph = GMany NothingO body NothingO }
where body = foldr addBlock emptyBody blocks
bodyToBlockList :: Body CmmNode -> [CmmBlock]
@@ -429,97 +429,77 @@ mapGraphNodes :: ( CmmNode C O -> CmmNode C O
, CmmNode O C -> CmmNode O C)
-> CmmGraph -> CmmGraph
mapGraphNodes funs@(mf,_,_) g =
- ofBlockMap (entryLabel $ mf $ CmmEntry $ g_entry g) $ mapMap (blockMapNodes3 funs) $ toBlockMap g
+ ofBlockMap (entryLabel $ mf $ CmmEntry $ g_entry g) $ mapMap (mapBlock3' funs) $ toBlockMap g
mapGraphNodes1 :: (forall e x. CmmNode e x -> CmmNode e x) -> CmmGraph -> CmmGraph
-mapGraphNodes1 f g = modifyGraph (graphMapBlocks (blockMapNodes f)) g
+mapGraphNodes1 f = modifyGraph (mapGraph f)
foldGraphBlocks :: (CmmBlock -> a -> a) -> a -> CmmGraph -> a
foldGraphBlocks k z g = mapFold k z $ toBlockMap g
postorderDfs :: CmmGraph -> [CmmBlock]
-postorderDfs g = postorder_dfs_from (toBlockMap g) (g_entry g)
-
--------------------------------------------------
--- Manipulating CmmBlocks
-
-lastNode :: CmmBlock -> CmmNode O C
-lastNode block = foldBlockNodesF3 (nothing, nothing, const) block ()
- where nothing :: a -> b -> ()
- nothing _ _ = ()
-
-replaceLastNode :: Block CmmNode e C -> CmmNode O C -> Block CmmNode e C
-replaceLastNode block last = blockOfNodeList (first, middle, JustC last)
- where (first, middle, _) = blockToNodeList block
-
-----------------------------------------------------------------------
------ Splicing between blocks
--- Given a middle node, a block, and a successor BlockId,
--- we can insert the middle node between the block and the successor.
--- We return the updated block and a list of new blocks that must be added
--- to the graph.
--- The semantics is a bit tricky. We consider cases on the last node:
--- o For a branch, we can just insert before the branch,
--- but sometimes the optimizer does better if we actually insert
--- a fresh basic block, enabling some common blockification.
--- o For a conditional branch, switch statement, or call, we must insert
--- a new basic block.
--- o For a jump or return, this operation is impossible.
-
-insertBetween :: MonadUnique m => CmmBlock -> [CmmNode O O] -> BlockId -> m (CmmBlock, [CmmBlock])
-insertBetween b ms succId = insert $ lastNode b
- where insert :: MonadUnique m => CmmNode O C -> m (CmmBlock, [CmmBlock])
- insert (CmmBranch bid) =
- if bid == succId then
- do (bid', bs) <- newBlocks
- return (replaceLastNode b (CmmBranch bid'), bs)
- else panic "tried invalid block insertBetween"
- insert (CmmCondBranch c t f) =
- do (t', tbs) <- if t == succId then newBlocks else return $ (t, [])
- (f', fbs) <- if f == succId then newBlocks else return $ (f, [])
- return (replaceLastNode b (CmmCondBranch c t' f'), tbs ++ fbs)
- insert (CmmSwitch e ks) =
- do (ids, bs) <- mapAndUnzipM mbNewBlocks ks
- return (replaceLastNode b (CmmSwitch e ids), join bs)
- insert (CmmCall {}) =
- panic "unimp: insertBetween after a call -- probably not a good idea"
- insert (CmmForeignCall {}) =
- panic "unimp: insertBetween after a foreign call -- probably not a good idea"
-
- newBlocks :: MonadUnique m => m (BlockId, [CmmBlock])
- newBlocks = do id <- liftM mkBlockId $ getUniqueM
- return $ (id, [blockOfNodeList (JustC (CmmEntry id), ms, JustC (CmmBranch succId))])
- mbNewBlocks :: MonadUnique m => Maybe BlockId -> m (Maybe BlockId, [CmmBlock])
- mbNewBlocks (Just k) = if k == succId then liftM fstJust newBlocks
- else return (Just k, [])
- mbNewBlocks Nothing = return (Nothing, [])
- fstJust (id, bs) = (Just id, bs)
+postorderDfs g = {-# SCC "postorderDfs" #-} postorder_dfs_from (toBlockMap g) (g_entry g)
-------------------------------------------------
-- Running dataflow analysis and/or rewrites
-- Constructing forward and backward analysis-only pass
-analFwd :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdPass m n f
-analBwd :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdPass m n f
+analFwd :: DataflowLattice f -> FwdTransfer n f -> FwdPass UniqSM n f
+analBwd :: DataflowLattice f -> BwdTransfer n f -> BwdPass UniqSM n f
analFwd lat xfer = analRewFwd lat xfer noFwdRewrite
analBwd lat xfer = analRewBwd lat xfer noBwdRewrite
-- Constructing forward and backward analysis + rewrite pass
-analRewFwd :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdRewrite m n f -> FwdPass m n f
-analRewBwd :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdRewrite m n f -> BwdPass m n f
+analRewFwd :: DataflowLattice f -> FwdTransfer n f
+ -> FwdRewrite UniqSM n f
+ -> FwdPass UniqSM n f
+
+analRewBwd :: DataflowLattice f
+ -> BwdTransfer n f
+ -> BwdRewrite UniqSM n f
+ -> BwdPass UniqSM n f
analRewFwd lat xfer rew = FwdPass {fp_lattice = lat, fp_transfer = xfer, fp_rewrite = rew}
analRewBwd lat xfer rew = BwdPass {bp_lattice = lat, bp_transfer = xfer, bp_rewrite = rew}
-- Running forward and backward dataflow analysis + optional rewrite
-dataflowPassFwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> FwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
+dataflowPassFwd :: NonLocal n =>
+ GenCmmGraph n -> [(BlockId, f)]
+ -> FwdPass UniqSM n f
+ -> UniqSM (GenCmmGraph n, BlockEnv f)
dataflowPassFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do
(graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
-dataflowPassBwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> BwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
+dataflowAnalFwd :: NonLocal n =>
+ GenCmmGraph n -> [(BlockId, f)]
+ -> FwdPass UniqSM n f
+ -> BlockEnv f
+dataflowAnalFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd =
+ analyzeFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
+
+dataflowAnalFwdBlocks :: NonLocal n =>
+ GenCmmGraph n -> [(BlockId, f)]
+ -> FwdPass UniqSM n f
+ -> UniqSM (BlockEnv f)
+dataflowAnalFwdBlocks (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do
+-- (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
+-- return facts
+ return (analyzeFwdBlocks fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts))
+
+dataflowAnalBwd :: NonLocal n =>
+ GenCmmGraph n -> [(BlockId, f)]
+ -> BwdPass UniqSM n f
+ -> BlockEnv f
+dataflowAnalBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd =
+ analyzeBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts)
+
+dataflowPassBwd :: NonLocal n =>
+ GenCmmGraph n -> [(BlockId, f)]
+ -> BwdPass UniqSM n f
+ -> UniqSM (GenCmmGraph n, BlockEnv f)
dataflowPassBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd = do
(graph, facts, NothingO) <- analyzeAndRewriteBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts)
return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
diff --git a/compiler/cmm/Hoopl.hs b/compiler/cmm/Hoopl.hs
new file mode 100644
index 0000000000..0eca85cb8a
--- /dev/null
+++ b/compiler/cmm/Hoopl.hs
@@ -0,0 +1,125 @@
+module Hoopl (
+ module Compiler.Hoopl,
+ module Hoopl.Dataflow,
+ deepFwdRw, deepFwdRw3,
+ deepBwdRw, deepBwdRw3,
+ thenFwdRw
+ ) where
+
+import Compiler.Hoopl hiding
+ ( Unique,
+ FwdTransfer(..), FwdRewrite(..), FwdPass(..),
+ BwdTransfer(..), BwdRewrite(..), BwdPass(..),
+ noFwdRewrite, noBwdRewrite,
+ analyzeAndRewriteFwd, analyzeAndRewriteBwd,
+ mkFactBase, Fact,
+ mkBRewrite, mkBRewrite3, mkBTransfer, mkBTransfer3,
+ mkFRewrite, mkFRewrite3, mkFTransfer, mkFTransfer3,
+ deepFwdRw, deepFwdRw3, thenFwdRw, iterFwdRw,
+ deepBwdRw, deepBwdRw3, thenBwdRw, iterBwdRw
+ )
+
+import Hoopl.Dataflow
+import Control.Monad
+import UniqSupply
+
+deepFwdRw3 :: (n C O -> f -> UniqSM (Maybe (Graph n C O)))
+ -> (n O O -> f -> UniqSM (Maybe (Graph n O O)))
+ -> (n O C -> f -> UniqSM (Maybe (Graph n O C)))
+ -> (FwdRewrite UniqSM n f)
+deepFwdRw :: (forall e x . n e x -> f -> UniqSM (Maybe (Graph n e x))) -> FwdRewrite UniqSM n f
+deepFwdRw3 f m l = iterFwdRw $ mkFRewrite3 f m l
+deepFwdRw f = deepFwdRw3 f f f
+
+-- N.B. rw3, rw3', and rw3a are triples of functions.
+-- But rw and rw' are single functions.
+thenFwdRw :: forall n f.
+ FwdRewrite UniqSM n f
+ -> FwdRewrite UniqSM n f
+ -> FwdRewrite UniqSM n f
+thenFwdRw rw3 rw3' = wrapFR2 thenrw rw3 rw3'
+ where
+ thenrw :: forall e x t t1.
+ (t -> t1 -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+ -> (t -> t1 -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+ -> t
+ -> t1
+ -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))
+ thenrw rw rw' n f = rw n f >>= fwdRes
+ where fwdRes Nothing = rw' n f
+ fwdRes (Just gr) = return $ Just $ fadd_rw rw3' gr
+
+iterFwdRw :: forall n f.
+ FwdRewrite UniqSM n f
+ -> FwdRewrite UniqSM n f
+iterFwdRw rw3 = wrapFR iter rw3
+ where iter :: forall a e x t.
+ (t -> a -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+ -> t
+ -> a
+ -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f))
+ iter rw n = (liftM $ liftM $ fadd_rw (iterFwdRw rw3)) . rw n
+
+-- | Function inspired by 'rew' in the paper
+_frewrite_cps :: ((Graph n e x, FwdRewrite UniqSM n f) -> UniqSM a)
+ -> UniqSM a
+ -> (forall e x . n e x -> f -> UniqSM (Maybe (Graph n e x, FwdRewrite UniqSM n f)))
+ -> n e x
+ -> f
+ -> UniqSM a
+_frewrite_cps j n rw node f =
+ do mg <- rw node f
+ case mg of Nothing -> n
+ Just gr -> j gr
+
+
+
+-- | Function inspired by 'add' in the paper
+fadd_rw :: FwdRewrite UniqSM n f
+ -> (Graph n e x, FwdRewrite UniqSM n f)
+ -> (Graph n e x, FwdRewrite UniqSM n f)
+fadd_rw rw2 (g, rw1) = (g, rw1 `thenFwdRw` rw2)
+
+
+
+deepBwdRw3 ::
+ (n C O -> f -> UniqSM (Maybe (Graph n C O)))
+ -> (n O O -> f -> UniqSM (Maybe (Graph n O O)))
+ -> (n O C -> FactBase f -> UniqSM (Maybe (Graph n O C)))
+ -> (BwdRewrite UniqSM n f)
+deepBwdRw :: (forall e x . n e x -> Fact x f -> UniqSM (Maybe (Graph n e x)))
+ -> BwdRewrite UniqSM n f
+deepBwdRw3 f m l = iterBwdRw $ mkBRewrite3 f m l
+deepBwdRw f = deepBwdRw3 f f f
+
+
+thenBwdRw :: forall n f. BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f
+thenBwdRw rw1 rw2 = wrapBR2 f rw1 rw2
+ where f :: forall t t1 t2 e x.
+ t
+ -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)))
+ -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)))
+ -> t1
+ -> t2
+ -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f))
+ f _ rw1 rw2' n f = do
+ res1 <- rw1 n f
+ case res1 of
+ Nothing -> rw2' n f
+ Just gr -> return $ Just $ badd_rw rw2 gr
+
+iterBwdRw :: forall n f. BwdRewrite UniqSM n f -> BwdRewrite UniqSM n f
+iterBwdRw rw = wrapBR f rw
+ where f :: forall t e x t1 t2.
+ t
+ -> (t1 -> t2 -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f)))
+ -> t1
+ -> t2
+ -> UniqSM (Maybe (Graph n e x, BwdRewrite UniqSM n f))
+ f _ rw' n f = liftM (liftM (badd_rw (iterBwdRw rw))) (rw' n f)
+
+-- | Function inspired by 'add' in the paper
+badd_rw :: BwdRewrite UniqSM n f
+ -> (Graph n e x, BwdRewrite UniqSM n f)
+ -> (Graph n e x, BwdRewrite UniqSM n f)
+badd_rw rw2 (g, rw1) = (g, rw1 `thenBwdRw` rw2)
diff --git a/compiler/cmm/Hoopl/Dataflow.hs b/compiler/cmm/Hoopl/Dataflow.hs
new file mode 100644
index 0000000000..e35beb93e9
--- /dev/null
+++ b/compiler/cmm/Hoopl/Dataflow.hs
@@ -0,0 +1,887 @@
+--
+-- Copyright (c) 2010, João Dias, Simon Marlow, Simon Peyton Jones,
+-- and Norman Ramsey
+--
+-- Modifications copyright (c) The University of Glasgow 2012
+--
+-- This module is a specialised and optimised version of
+-- Compiler.Hoopl.Dataflow in the hoopl package. In particular it is
+-- specialised to the UniqSM monad.
+--
+
+{-# LANGUAGE RankNTypes, ScopedTypeVariables, GADTs, EmptyDataDecls, PatternGuards, TypeFamilies, MultiParamTypeClasses #-}
+#if __GLASGOW_HASKELL__ >= 703
+{-# OPTIONS_GHC -fprof-auto-top #-}
+#endif
+#if __GLASGOW_HASKELL__ >= 701
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+module Hoopl.Dataflow
+ ( DataflowLattice(..), OldFact(..), NewFact(..), Fact, mkFactBase
+ , ChangeFlag(..)
+ , FwdPass(..), FwdTransfer, mkFTransfer, mkFTransfer3, getFTransfer3
+ -- * Respecting Fuel
+
+ -- $fuel
+ , FwdRewrite, mkFRewrite, mkFRewrite3, getFRewrite3, noFwdRewrite
+ , wrapFR, wrapFR2
+ , BwdPass(..), BwdTransfer, mkBTransfer, mkBTransfer3, getBTransfer3
+ , wrapBR, wrapBR2
+ , BwdRewrite, mkBRewrite, mkBRewrite3, getBRewrite3, noBwdRewrite
+ , analyzeAndRewriteFwd, analyzeAndRewriteBwd
+ , analyzeFwd, analyzeFwdBlocks, analyzeBwd
+ )
+where
+
+import UniqSupply
+
+import Data.Maybe
+import Data.Array
+
+import Compiler.Hoopl hiding
+ ( mkBRewrite3, mkFRewrite3, noFwdRewrite, noBwdRewrite
+ , analyzeAndRewriteBwd, analyzeAndRewriteFwd
+ )
+import Compiler.Hoopl.Internals
+ ( wrapFR, wrapFR2
+ , wrapBR, wrapBR2
+ , splice
+ )
+
+
+-- -----------------------------------------------------------------------------
+
+noRewrite :: a -> b -> UniqSM (Maybe c)
+noRewrite _ _ = return Nothing
+
+noFwdRewrite :: FwdRewrite UniqSM n f
+noFwdRewrite = FwdRewrite3 (noRewrite, noRewrite, noRewrite)
+
+-- | Functions passed to 'mkFRewrite3' should not be aware of the fuel supply.
+-- The result returned by 'mkFRewrite3' respects fuel.
+mkFRewrite3 :: forall n f.
+ (n C O -> f -> UniqSM (Maybe (Graph n C O)))
+ -> (n O O -> f -> UniqSM (Maybe (Graph n O O)))
+ -> (n O C -> f -> UniqSM (Maybe (Graph n O C)))
+ -> FwdRewrite UniqSM n f
+mkFRewrite3 f m l = FwdRewrite3 (lift f, lift m, lift l)
+ where lift :: forall t t1 a. (t -> t1 -> UniqSM (Maybe a))
+ -> t -> t1 -> UniqSM (Maybe (a, FwdRewrite UniqSM n f))
+ {-# INLINE lift #-}
+ lift rw node fact = do
+ a <- rw node fact
+ case a of
+ Nothing -> return Nothing
+ Just a -> return (Just (a,noFwdRewrite))
+
+noBwdRewrite :: BwdRewrite UniqSM n f
+noBwdRewrite = BwdRewrite3 (noRewrite, noRewrite, noRewrite)
+
+mkBRewrite3 :: forall n f.
+ (n C O -> f -> UniqSM (Maybe (Graph n C O)))
+ -> (n O O -> f -> UniqSM (Maybe (Graph n O O)))
+ -> (n O C -> FactBase f -> UniqSM (Maybe (Graph n O C)))
+ -> BwdRewrite UniqSM n f
+mkBRewrite3 f m l = BwdRewrite3 (lift f, lift m, lift l)
+ where lift :: forall t t1 a. (t -> t1 -> UniqSM (Maybe a))
+ -> t -> t1 -> UniqSM (Maybe (a, BwdRewrite UniqSM n f))
+ {-# INLINE lift #-}
+ lift rw node fact = do
+ a <- rw node fact
+ case a of
+ Nothing -> return Nothing
+ Just a -> return (Just (a,noBwdRewrite))
+
+-----------------------------------------------------------------------------
+-- Analyze and rewrite forward: the interface
+-----------------------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+-- be no other entry point, or all goes horribly wrong...
+analyzeAndRewriteFwd
+ :: forall n f e x . NonLocal n =>
+ FwdPass UniqSM n f
+ -> MaybeC e [Label]
+ -> Graph n e x -> Fact e f
+ -> UniqSM (Graph n e x, FactBase f, MaybeO x f)
+analyzeAndRewriteFwd pass entries g f =
+ do (rg, fout) <- arfGraph pass (fmap targetLabels entries) g f
+ let (g', fb) = normalizeGraph rg
+ return (g', fb, distinguishedExitFact g' fout)
+
+distinguishedExitFact :: forall n e x f . Graph n e x -> Fact x f -> MaybeO x f
+distinguishedExitFact g f = maybe g
+ where maybe :: Graph n e x -> MaybeO x f
+ maybe GNil = JustO f
+ maybe (GUnit {}) = JustO f
+ maybe (GMany _ _ x) = case x of NothingO -> NothingO
+ JustO _ -> JustO f
+
+----------------------------------------------------------------
+-- Forward Implementation
+----------------------------------------------------------------
+
+type Entries e = MaybeC e [Label]
+
+arfGraph :: forall n f e x . NonLocal n =>
+ FwdPass UniqSM n f ->
+ Entries e -> Graph n e x -> Fact e f -> UniqSM (DG f n e x, Fact x f)
+arfGraph pass@FwdPass { fp_lattice = lattice,
+ fp_transfer = transfer,
+ fp_rewrite = rewrite } entries g in_fact = graph g in_fact
+ where
+ {- nested type synonyms would be so lovely here
+ type ARF thing = forall e x . thing e x -> f -> m (DG f n e x, Fact x f)
+ type ARFX thing = forall e x . thing e x -> Fact e f -> m (DG f n e x, Fact x f)
+ -}
+ graph :: Graph n e x -> Fact e f -> UniqSM (DG f n e x, Fact x f)
+ block :: forall e x .
+ Block n e x -> f -> UniqSM (DG f n e x, Fact x f)
+
+ body :: [Label] -> LabelMap (Block n C C)
+ -> Fact C f -> UniqSM (DG f n C C, Fact C f)
+ -- Outgoing factbase is restricted to Labels *not* in
+ -- the Body; the facts for Labels *in*
+ -- the Body are in the 'DG f n C C'
+
+ cat :: forall e a x f1 f2 f3.
+ (f1 -> UniqSM (DG f n e a, f2))
+ -> (f2 -> UniqSM (DG f n a x, f3))
+ -> (f1 -> UniqSM (DG f n e x, f3))
+
+ graph GNil f = return (dgnil, f)
+ graph (GUnit blk) f = block blk f
+ graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f
+ where
+ ebcat :: MaybeO e (Block n O C) -> Body n -> Fact e f -> UniqSM (DG f n e C, Fact C f)
+ exit :: MaybeO x (Block n C O) -> Fact C f -> UniqSM (DG f n C x, Fact x f)
+ exit (JustO blk) f = arfx block blk f
+ exit NothingO f = return (dgnilC, f)
+ ebcat entry bdy f = c entries entry f
+ where c :: MaybeC e [Label] -> MaybeO e (Block n O C)
+ -> Fact e f -> UniqSM (DG f n e C, Fact C f)
+ c NothingC (JustO entry) f = (block entry `cat` body (successors entry) bdy) f
+ c (JustC entries) NothingO f = body entries bdy f
+ c _ _ _ = error "bogus GADT pattern match failure"
+
+ -- Lift from nodes to blocks
+ block BNil f = return (dgnil, f)
+ block (BlockCO n b) f = (node n `cat` block b) f
+ block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f
+ block (BlockOC b n) f = (block b `cat` node n) f
+
+ block (BMiddle n) f = node n f
+ block (BCat b1 b2) f = (block b1 `cat` block b2) f
+ block (BSnoc h n) f = (block h `cat` node n) f
+ block (BCons n t) f = (node n `cat` block t) f
+
+ {-# INLINE node #-}
+ node :: forall e x . (ShapeLifter e x)
+ => n e x -> f -> UniqSM (DG f n e x, Fact x f)
+ node n f
+ = do { grw <- frewrite rewrite n f
+ ; case grw of
+ Nothing -> return ( singletonDG f n
+ , ftransfer transfer n f )
+ Just (g, rw) ->
+ let pass' = pass { fp_rewrite = rw }
+ f' = fwdEntryFact n f
+ in arfGraph pass' (fwdEntryLabel n) g f' }
+
+ -- | Compose fact transformers and concatenate the resulting
+ -- rewritten graphs.
+ {-# INLINE cat #-}
+ cat ft1 ft2 f = do { (g1,f1) <- ft1 f
+ ; (g2,f2) <- ft2 f1
+ ; let !g = g1 `dgSplice` g2
+ ; return (g, f2) }
+
+ arfx :: forall x .
+ (Block n C x -> f -> UniqSM (DG f n C x, Fact x f))
+ -> (Block n C x -> Fact C f -> UniqSM (DG f n C x, Fact x f))
+ arfx arf thing fb =
+ arf thing $ fromJust $ lookupFact (entryLabel thing) $ joinInFacts lattice fb
+ -- joinInFacts adds debugging information
+
+
+ -- Outgoing factbase is restricted to Labels *not* in
+ -- the Body; the facts for Labels *in*
+ -- the Body are in the 'DG f n C C'
+ body entries blockmap init_fbase
+ = fixpoint Fwd lattice do_block entries blockmap init_fbase
+ where
+ lattice = fp_lattice pass
+ do_block :: forall x . Block n C x -> FactBase f
+ -> UniqSM (DG f n C x, Fact x f)
+ do_block b fb = block b entryFact
+ where entryFact = getFact lattice (entryLabel b) fb
+
+
+-- Join all the incoming facts with bottom.
+-- We know the results _shouldn't change_, but the transfer
+-- functions might, for example, generate some debugging traces.
+joinInFacts :: DataflowLattice f -> FactBase f -> FactBase f
+joinInFacts (lattice @ DataflowLattice {fact_bot = bot, fact_join = fj}) fb =
+ mkFactBase lattice $ map botJoin $ mapToList fb
+ where botJoin (l, f) = (l, snd $ fj l (OldFact bot) (NewFact f))
+
+forwardBlockList :: (NonLocal n)
+ => [Label] -> Body n -> [Block n C C]
+-- This produces a list of blocks in order suitable for forward analysis,
+-- along with the list of Labels it may depend on for facts.
+forwardBlockList entries blks = postorder_dfs_from blks entries
+
+----------------------------------------------------------------
+-- Forward Analysis only
+----------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+-- be no other entry point, or all goes horribly wrong...
+analyzeFwd
+ :: forall n f e . NonLocal n =>
+ FwdPass UniqSM n f
+ -> MaybeC e [Label]
+ -> Graph n e C -> Fact e f
+ -> FactBase f
+analyzeFwd FwdPass { fp_lattice = lattice,
+ fp_transfer = FwdTransfer3 (ftr, mtr, ltr) }
+ entries g in_fact = graph g in_fact
+ where
+ graph :: Graph n e C -> Fact e f -> FactBase f
+ graph (GMany entry blockmap NothingO)
+ = case (entries, entry) of
+ (NothingC, JustO entry) -> block entry `cat` body (successors entry)
+ (JustC entries, NothingO) -> body entries
+ _ -> error "bogus GADT pattern match failure"
+ where
+ body :: [Label] -> Fact C f -> Fact C f
+ body entries f
+ = fixpointAnal Fwd lattice do_block entries blockmap f
+ where
+ do_block :: forall x . Block n C x -> FactBase f -> Fact x f
+ do_block b fb = block b entryFact
+ where entryFact = getFact lattice (entryLabel b) fb
+
+ -- NB. eta-expand block, GHC can't do this by itself. See #5809.
+ block :: forall e x . Block n e x -> f -> Fact x f
+ block BNil f = f
+ block (BlockCO n b) f = (ftr n `cat` block b) f
+ block (BlockCC l b n) f = (ftr l `cat` (block b `cat` ltr n)) f
+ block (BlockOC b n) f = (block b `cat` ltr n) f
+
+ block (BMiddle n) f = mtr n f
+ block (BCat b1 b2) f = (block b1 `cat` block b2) f
+ block (BSnoc h n) f = (block h `cat` mtr n) f
+ block (BCons n t) f = (mtr n `cat` block t) f
+
+ {-# INLINE cat #-}
+ cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3)
+ cat ft1 ft2 = \f -> ft2 $! ft1 f
+
+-- | if the graph being analyzed is open at the entry, there must
+-- be no other entry point, or all goes horribly wrong...
+analyzeFwdBlocks
+ :: forall n f e . NonLocal n =>
+ FwdPass UniqSM n f
+ -> MaybeC e [Label]
+ -> Graph n e C -> Fact e f
+ -> FactBase f
+analyzeFwdBlocks FwdPass { fp_lattice = lattice,
+ fp_transfer = FwdTransfer3 (ftr, _, ltr) }
+ entries g in_fact = graph g in_fact
+ where
+ graph :: Graph n e C -> Fact e f -> FactBase f
+ graph (GMany entry blockmap NothingO)
+ = case (entries, entry) of
+ (NothingC, JustO entry) -> block entry `cat` body (successors entry)
+ (JustC entries, NothingO) -> body entries
+ _ -> error "bogus GADT pattern match failure"
+ where
+ body :: [Label] -> Fact C f -> Fact C f
+ body entries f
+ = fixpointAnal Fwd lattice do_block entries blockmap f
+ where
+ do_block :: forall x . Block n C x -> FactBase f -> Fact x f
+ do_block b fb = block b entryFact
+ where entryFact = getFact lattice (entryLabel b) fb
+
+ -- NB. eta-expand block, GHC can't do this by itself. See #5809.
+ block :: forall e x . Block n e x -> f -> Fact x f
+ block BNil f = f
+ block (BlockCO n _) f = ftr n f
+ block (BlockCC l _ n) f = (ftr l `cat` ltr n) f
+ block (BlockOC _ n) f = ltr n f
+ block _ _ = error "analyzeFwdBlocks"
+
+ {-# INLINE cat #-}
+ cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3)
+ cat ft1 ft2 = \f -> ft2 $! ft1 f
+
+----------------------------------------------------------------
+-- Backward Analysis only
+----------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+-- be no other entry point, or all goes horribly wrong...
+analyzeBwd
+ :: forall n f e . NonLocal n =>
+ BwdPass UniqSM n f
+ -> MaybeC e [Label]
+ -> Graph n e C -> Fact C f
+ -> FactBase f
+analyzeBwd BwdPass { bp_lattice = lattice,
+ bp_transfer = BwdTransfer3 (ftr, mtr, ltr) }
+ entries g in_fact = graph g in_fact
+ where
+ graph :: Graph n e C -> Fact C f -> FactBase f
+ graph (GMany entry blockmap NothingO)
+ = case (entries, entry) of
+ (NothingC, JustO entry) -> body (successors entry)
+ (JustC entries, NothingO) -> body entries
+ _ -> error "bogus GADT pattern match failure"
+ where
+ body :: [Label] -> Fact C f -> Fact C f
+ body entries f
+ = fixpointAnal Bwd lattice do_block entries blockmap f
+ where
+ do_block :: forall x . Block n C x -> Fact x f -> FactBase f
+ do_block b fb = mapSingleton (entryLabel b) (block b fb)
+
+ -- NB. eta-expand block, GHC can't do this by itself. See #5809.
+ block :: forall e x . Block n e x -> Fact x f -> f
+ block BNil f = f
+ block (BlockCO n b) f = (ftr n `cat` block b) f
+ block (BlockCC l b n) f = ((ftr l `cat` block b) `cat` ltr n) f
+ block (BlockOC b n) f = (block b `cat` ltr n) f
+
+ block (BMiddle n) f = mtr n f
+ block (BCat b1 b2) f = (block b1 `cat` block b2) f
+ block (BSnoc h n) f = (block h `cat` mtr n) f
+ block (BCons n t) f = (mtr n `cat` block t) f
+
+ {-# INLINE cat #-}
+ cat :: forall f1 f2 f3 . (f2 -> f3) -> (f1 -> f2) -> (f1 -> f3)
+ cat ft1 ft2 = \f -> ft1 $! ft2 f
+
+-----------------------------------------------------------------------------
+-- Backward analysis and rewriting: the interface
+-----------------------------------------------------------------------------
+
+
+-- | if the graph being analyzed is open at the exit, I don't
+-- quite understand the implications of possible other exits
+analyzeAndRewriteBwd
+ :: NonLocal n
+ => BwdPass UniqSM n f
+ -> MaybeC e [Label] -> Graph n e x -> Fact x f
+ -> UniqSM (Graph n e x, FactBase f, MaybeO e f)
+analyzeAndRewriteBwd pass entries g f =
+ do (rg, fout) <- arbGraph pass (fmap targetLabels entries) g f
+ let (g', fb) = normalizeGraph rg
+ return (g', fb, distinguishedEntryFact g' fout)
+
+distinguishedEntryFact :: forall n e x f . Graph n e x -> Fact e f -> MaybeO e f
+distinguishedEntryFact g f = maybe g
+ where maybe :: Graph n e x -> MaybeO e f
+ maybe GNil = JustO f
+ maybe (GUnit {}) = JustO f
+ maybe (GMany e _ _) = case e of NothingO -> NothingO
+ JustO _ -> JustO f
+
+
+-----------------------------------------------------------------------------
+-- Backward implementation
+-----------------------------------------------------------------------------
+
+arbGraph :: forall n f e x .
+ NonLocal n =>
+ BwdPass UniqSM n f ->
+ Entries e -> Graph n e x -> Fact x f -> UniqSM (DG f n e x, Fact e f)
+arbGraph pass@BwdPass { bp_lattice = lattice,
+ bp_transfer = transfer,
+ bp_rewrite = rewrite } entries g in_fact = graph g in_fact
+ where
+ {- nested type synonyms would be so lovely here
+ type ARB thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, f)
+ type ARBX thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, Fact e f)
+ -}
+ graph :: Graph n e x -> Fact x f -> UniqSM (DG f n e x, Fact e f)
+ block :: forall e x . Block n e x -> Fact x f -> UniqSM (DG f n e x, f)
+ body :: [Label] -> Body n -> Fact C f -> UniqSM (DG f n C C, Fact C f)
+ node :: forall e x . (ShapeLifter e x)
+ => n e x -> Fact x f -> UniqSM (DG f n e x, f)
+ cat :: forall e a x info info' info''.
+ (info' -> UniqSM (DG f n e a, info''))
+ -> (info -> UniqSM (DG f n a x, info'))
+ -> (info -> UniqSM (DG f n e x, info''))
+
+ graph GNil f = return (dgnil, f)
+ graph (GUnit blk) f = block blk f
+ graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f
+ where
+ ebcat :: MaybeO e (Block n O C) -> Body n -> Fact C f -> UniqSM (DG f n e C, Fact e f)
+ exit :: MaybeO x (Block n C O) -> Fact x f -> UniqSM (DG f n C x, Fact C f)
+ exit (JustO blk) f = arbx block blk f
+ exit NothingO f = return (dgnilC, f)
+ ebcat entry bdy f = c entries entry f
+ where c :: MaybeC e [Label] -> MaybeO e (Block n O C)
+ -> Fact C f -> UniqSM (DG f n e C, Fact e f)
+ c NothingC (JustO entry) f = (block entry `cat` body (successors entry) bdy) f
+ c (JustC entries) NothingO f = body entries bdy f
+ c _ _ _ = error "bogus GADT pattern match failure"
+
+ -- Lift from nodes to blocks
+ block BNil f = return (dgnil, f)
+ block (BlockCO n b) f = (node n `cat` block b) f
+ block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f
+ block (BlockOC b n) f = (block b `cat` node n) f
+
+ block (BMiddle n) f = node n f
+ block (BCat b1 b2) f = (block b1 `cat` block b2) f
+ block (BSnoc h n) f = (block h `cat` node n) f
+ block (BCons n t) f = (node n `cat` block t) f
+
+ {-# INLINE node #-}
+ node n f
+ = do { bwdres <- brewrite rewrite n f
+ ; case bwdres of
+ Nothing -> return (singletonDG entry_f n, entry_f)
+ where entry_f = btransfer transfer n f
+ Just (g, rw) ->
+ do { let pass' = pass { bp_rewrite = rw }
+ ; (g, f) <- arbGraph pass' (fwdEntryLabel n) g f
+ ; return (g, bwdEntryFact lattice n f)} }
+
+ -- | Compose fact transformers and concatenate the resulting
+ -- rewritten graphs.
+ {-# INLINE cat #-}
+ cat ft1 ft2 f = do { (g2,f2) <- ft2 f
+ ; (g1,f1) <- ft1 f2
+ ; let !g = g1 `dgSplice` g2
+ ; return (g, f1) }
+
+ arbx :: forall x .
+ (Block n C x -> Fact x f -> UniqSM (DG f n C x, f))
+ -> (Block n C x -> Fact x f -> UniqSM (DG f n C x, Fact C f))
+
+ arbx arb thing f = do { (rg, f) <- arb thing f
+ ; let fb = joinInFacts (bp_lattice pass) $
+ mapSingleton (entryLabel thing) f
+ ; return (rg, fb) }
+ -- joinInFacts adds debugging information
+
+ -- Outgoing factbase is restricted to Labels *not* in
+ -- the Body; the facts for Labels *in*
+ -- the Body are in the 'DG f n C C'
+ body entries blockmap init_fbase
+ = fixpoint Bwd (bp_lattice pass) do_block entries blockmap init_fbase
+ where
+ do_block :: forall x. Block n C x -> Fact x f -> UniqSM (DG f n C x, LabelMap f)
+ do_block b f = do (g, f) <- block b f
+ return (g, mapSingleton (entryLabel b) f)
+
+
+{-
+
+The forward and backward cases are not dual. In the forward case, the
+entry points are known, and one simply traverses the body blocks from
+those points. In the backward case, something is known about the exit
+points, but this information is essentially useless, because we don't
+actually have a dual graph (that is, one with edges reversed) to
+compute with. (Even if we did have a dual graph, it would not avail
+us---a backward analysis must include reachable blocks that don't
+reach the exit, as in a procedure that loops forever and has side
+effects.)
+
+-}
+
+-----------------------------------------------------------------------------
+-- fixpoint
+-----------------------------------------------------------------------------
+
+data Direction = Fwd | Bwd
+
+-- | fixpointing for analysis-only
+--
+fixpointAnal :: forall n f. NonLocal n
+ => Direction
+ -> DataflowLattice f
+ -> (Block n C C -> Fact C f -> Fact C f)
+ -> [Label]
+ -> LabelMap (Block n C C)
+ -> Fact C f -> FactBase f
+
+fixpointAnal direction DataflowLattice{ fact_bot = _, fact_join = join }
+ do_block entries blockmap init_fbase
+ = loop start init_fbase
+ where
+ blocks = sortBlocks direction entries blockmap
+ n = length blocks
+ block_arr = {-# SCC "block_arr" #-} listArray (0,n-1) blocks
+ start = {-# SCC "start" #-} [0..n-1]
+ dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks
+
+ loop
+ :: IntHeap -- blocks still to analyse
+ -> FactBase f -- current factbase (increases monotonically)
+ -> FactBase f
+
+ loop [] fbase = fbase
+ loop (ix:todo) fbase =
+ let
+ blk = block_arr ! ix
+
+ out_facts = {-# SCC "do_block" #-} do_block blk fbase
+
+ !(todo', fbase') = {-# SCC "mapFoldWithKey" #-}
+ mapFoldWithKey (updateFact join dep_blocks)
+ (todo,fbase) out_facts
+ in
+ -- trace ("analysing: " ++ show (entryLabel blk)) $
+ -- trace ("fbase': " ++ show (mapKeys fbase')) $ return ()
+ -- trace ("changed: " ++ show changed) $ return ()
+ -- trace ("to analyse: " ++ show to_analyse) $ return ()
+
+ loop todo' fbase'
+
+
+-- | fixpointing for combined analysis/rewriting
+--
+fixpoint :: forall n f. NonLocal n
+ => Direction
+ -> DataflowLattice f
+ -> (Block n C C -> Fact C f -> UniqSM (DG f n C C, Fact C f))
+ -> [Label]
+ -> LabelMap (Block n C C)
+ -> (Fact C f -> UniqSM (DG f n C C, Fact C f))
+
+fixpoint direction DataflowLattice{ fact_bot = _, fact_join = join }
+ do_block entries blockmap init_fbase
+ = do
+ -- trace ("fixpoint: " ++ show (case direction of Fwd -> True; Bwd -> False) ++ " " ++ show (mapKeys blockmap) ++ show entries ++ " " ++ show (mapKeys init_fbase)) $ return()
+ (fbase, newblocks) <- loop start init_fbase mapEmpty
+ -- trace ("fixpoint DONE: " ++ show (mapKeys fbase) ++ show (mapKeys newblocks)) $ return()
+ return (GMany NothingO newblocks NothingO,
+ mapDeleteList (mapKeys blockmap) fbase)
+ -- The successors of the Graph are the Labels
+ -- for which we have facts and which are *not* in
+ -- the blocks of the graph
+ where
+ blocks = sortBlocks direction entries blockmap
+ n = length blocks
+ block_arr = {-# SCC "block_arr" #-} listArray (0,n-1) blocks
+ start = {-# SCC "start" #-} [0..n-1]
+ dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks
+
+ loop
+ :: IntHeap
+ -> FactBase f -- current factbase (increases monotonically)
+ -> LabelMap (DBlock f n C C) -- transformed graph
+ -> UniqSM (FactBase f, LabelMap (DBlock f n C C))
+
+ loop [] fbase newblocks = return (fbase, newblocks)
+ loop (ix:todo) fbase !newblocks = do
+ let blk = block_arr ! ix
+
+ -- trace ("analysing: " ++ show (entryLabel blk)) $ return ()
+ (rg, out_facts) <- do_block blk fbase
+ let !(todo', fbase') =
+ mapFoldWithKey (updateFact join dep_blocks)
+ (todo,fbase) out_facts
+
+ -- trace ("fbase': " ++ show (mapKeys fbase')) $ return ()
+ -- trace ("changed: " ++ show changed) $ return ()
+ -- trace ("to analyse: " ++ show to_analyse) $ return ()
+
+ let newblocks' = case rg of
+ GMany _ blks _ -> mapUnion blks newblocks
+
+ loop todo' fbase' newblocks'
+
+
+{- Note [TxFactBase invariants]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The TxFactBase is used only during a fixpoint iteration (or "sweep"),
+and accumulates facts (and the transformed code) during the fixpoint
+iteration.
+
+* tfb_fbase increases monotonically, across all sweeps
+
+* At the beginning of each sweep
+ tfb_cha = NoChange
+ tfb_lbls = {}
+
+* During each sweep we process each block in turn. Processing a block
+ is done thus:
+ 1. Read from tfb_fbase the facts for its entry label (forward)
+ or successors labels (backward)
+ 2. Transform those facts into new facts for its successors (forward)
+ or entry label (backward)
+ 3. Augment tfb_fbase with that info
+ We call the labels read in step (1) the "in-labels" of the sweep
+
+* The field tfb_lbls is the set of in-labels of all blocks that have
+ been processed so far this sweep, including the block that is
+ currently being processed. tfb_lbls is initialised to {}. It is a
+ subset of the Labels of the *original* (not transformed) blocks.
+
+* The tfb_cha field is set to SomeChange iff we decide we need to
+ perform another iteration of the fixpoint loop. It is initialised to NoChange.
+
+ Specifically, we set tfb_cha to SomeChange in step (3) iff
+ (a) The fact in tfb_fbase for a block L changes
+ (b) L is in tfb_lbls
+ Reason: until a label enters the in-labels its accumulated fact in tfb_fbase
+ has not been read, hence cannot affect the outcome
+
+Note [Unreachable blocks]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+A block that is not in the domain of tfb_fbase is "currently unreachable".
+A currently-unreachable block is not even analyzed. Reason: consider
+constant prop and this graph, with entry point L1:
+ L1: x:=3; goto L4
+ L2: x:=4; goto L4
+ L4: if x>3 goto L2 else goto L5
+Here L2 is actually unreachable, but if we process it with bottom input fact,
+we'll propagate (x=4) to L4, and nuke the otherwise-good rewriting of L4.
+
+* If a currently-unreachable block is not analyzed, then its rewritten
+ graph will not be accumulated in tfb_rg. And that is good:
+ unreachable blocks simply do not appear in the output.
+
+* Note that clients must be careful to provide a fact (even if bottom)
+ for each entry point. Otherwise useful blocks may be garbage collected.
+
+* Note that updateFact must set the change-flag if a label goes from
+ not-in-fbase to in-fbase, even if its fact is bottom. In effect the
+ real fact lattice is
+ UNR
+ bottom
+ the points above bottom
+
+* Even if the fact is going from UNR to bottom, we still call the
+ client's fact_join function because it might give the client
+ some useful debugging information.
+
+* All of this only applies for *forward* fixpoints. For the backward
+ case we must treat every block as reachable; it might finish with a
+ 'return', and therefore have no successors, for example.
+-}
+
+
+-----------------------------------------------------------------------------
+-- Pieces that are shared by fixpoint and fixpoint_anal
+-----------------------------------------------------------------------------
+
+-- | Sort the blocks into the right order for analysis.
+sortBlocks :: NonLocal n => Direction -> [Label] -> LabelMap (Block n C C)
+ -> [Block n C C]
+sortBlocks direction entries blockmap
+ = case direction of Fwd -> fwd
+ Bwd -> reverse fwd
+ where fwd = forwardBlockList entries blockmap
+
+-- | construct a mapping from L -> block indices. If the fact for L
+-- changes, re-analyse the given blocks.
+mkDepBlocks :: NonLocal n => Direction -> [Block n C C] -> LabelMap [Int]
+mkDepBlocks Fwd blocks = go blocks 0 mapEmpty
+ where go [] !_ m = m
+ go (b:bs) !n m = go bs (n+1) $! mapInsert (entryLabel b) [n] m
+mkDepBlocks Bwd blocks = go blocks 0 mapEmpty
+ where go [] !_ m = m
+ go (b:bs) !n m = go bs (n+1) $! go' (successors b) m
+ where go' [] m = m
+ go' (l:ls) m = go' ls (mapInsertWith (++) l [n] m)
+
+
+-- | After some new facts have been generated by analysing a block, we
+-- fold this function over them to generate (a) a list of block
+-- indices to (re-)analyse, and (b) the new FactBase.
+--
+updateFact :: JoinFun f -> LabelMap [Int]
+ -> Label -> f -- out fact
+ -> (IntHeap, FactBase f)
+ -> (IntHeap, FactBase f)
+
+updateFact fact_join dep_blocks lbl new_fact (todo, fbase)
+ = case lookupFact lbl fbase of
+ Nothing -> let !z = mapInsert lbl new_fact fbase in (changed, z)
+ -- Note [no old fact]
+ Just old_fact ->
+ case fact_join lbl (OldFact old_fact) (NewFact new_fact) of
+ (NoChange, _) -> (todo, fbase)
+ (_, f) -> let !z = mapInsert lbl f fbase in (changed, z)
+ where
+ changed = foldr insertIntHeap todo $
+ mapFindWithDefault [] lbl dep_blocks
+
+{-
+Note [no old fact]
+
+We know that the new_fact is >= _|_, so we don't need to join. However,
+if the new fact is also _|_, and we have already analysed its block,
+we don't need to record a change. So there's a tradeoff here. It turns
+out that always recording a change is faster.
+-}
+
+-----------------------------------------------------------------------------
+-- DG: an internal data type for 'decorated graphs'
+-- TOTALLY internal to Hoopl; each block is decorated with a fact
+-----------------------------------------------------------------------------
+
+type DG f = Graph' (DBlock f)
+data DBlock f n e x = DBlock f (Block n e x) -- ^ block decorated with fact
+
+instance NonLocal n => NonLocal (DBlock f n) where
+ entryLabel (DBlock _ b) = entryLabel b
+ successors (DBlock _ b) = successors b
+
+--- constructors
+
+dgnil :: DG f n O O
+dgnilC :: DG f n C C
+dgSplice :: NonLocal n => DG f n e a -> DG f n a x -> DG f n e x
+
+---- observers
+
+normalizeGraph :: forall n f e x .
+ NonLocal n => DG f n e x
+ -> (Graph n e x, FactBase f)
+ -- A Graph together with the facts for that graph
+ -- The domains of the two maps should be identical
+
+normalizeGraph g = (mapGraphBlocks dropFact g, facts g)
+ where dropFact :: DBlock t t1 t2 t3 -> Block t1 t2 t3
+ dropFact (DBlock _ b) = b
+ facts :: DG f n e x -> FactBase f
+ facts GNil = noFacts
+ facts (GUnit _) = noFacts
+ facts (GMany _ body exit) = bodyFacts body `mapUnion` exitFacts exit
+ exitFacts :: MaybeO x (DBlock f n C O) -> FactBase f
+ exitFacts NothingO = noFacts
+ exitFacts (JustO (DBlock f b)) = mapSingleton (entryLabel b) f
+ bodyFacts :: LabelMap (DBlock f n C C) -> FactBase f
+ bodyFacts body = mapFoldWithKey f noFacts body
+ where f :: forall t a x. (NonLocal t) => Label -> DBlock a t C x -> LabelMap a -> LabelMap a
+ f lbl (DBlock f _) fb = mapInsert lbl f fb
+
+--- implementation of the constructors (boring)
+
+dgnil = GNil
+dgnilC = GMany NothingO emptyBody NothingO
+
+dgSplice = splice fzCat
+ where fzCat :: DBlock f n e O -> DBlock t n O x -> DBlock f n e x
+ fzCat (DBlock f b1) (DBlock _ b2) = DBlock f $! b1 `blockAppend` b2
+ -- NB. strictness, this function is hammered.
+
+----------------------------------------------------------------
+-- Utilities
+----------------------------------------------------------------
+
+-- Lifting based on shape:
+-- - from nodes to blocks
+-- - from facts to fact-like things
+-- Lowering back:
+-- - from fact-like things to facts
+-- Note that the latter two functions depend only on the entry shape.
+class ShapeLifter e x where
+ singletonDG :: f -> n e x -> DG f n e x
+ fwdEntryFact :: NonLocal n => n e x -> f -> Fact e f
+ fwdEntryLabel :: NonLocal n => n e x -> MaybeC e [Label]
+ ftransfer :: FwdTransfer n f -> n e x -> f -> Fact x f
+ frewrite :: FwdRewrite m n f -> n e x
+ -> f -> m (Maybe (Graph n e x, FwdRewrite m n f))
+-- @ end node.tex
+ bwdEntryFact :: NonLocal n => DataflowLattice f -> n e x -> Fact e f -> f
+ btransfer :: BwdTransfer n f -> n e x -> Fact x f -> f
+ brewrite :: BwdRewrite m n f -> n e x
+ -> Fact x f -> m (Maybe (Graph n e x, BwdRewrite m n f))
+
+instance ShapeLifter C O where
+ singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil))
+ fwdEntryFact n f = mapSingleton (entryLabel n) f
+ bwdEntryFact lat n fb = getFact lat (entryLabel n) fb
+ ftransfer (FwdTransfer3 (ft, _, _)) n f = ft n f
+ btransfer (BwdTransfer3 (bt, _, _)) n f = bt n f
+ frewrite (FwdRewrite3 (fr, _, _)) n f = fr n f
+ brewrite (BwdRewrite3 (br, _, _)) n f = br n f
+ fwdEntryLabel n = JustC [entryLabel n]
+
+instance ShapeLifter O O where
+ singletonDG f = gUnitOO . DBlock f . BMiddle
+ fwdEntryFact _ f = f
+ bwdEntryFact _ _ f = f
+ ftransfer (FwdTransfer3 (_, ft, _)) n f = ft n f
+ btransfer (BwdTransfer3 (_, bt, _)) n f = bt n f
+ frewrite (FwdRewrite3 (_, fr, _)) n f = fr n f
+ brewrite (BwdRewrite3 (_, br, _)) n f = br n f
+ fwdEntryLabel _ = NothingC
+
+instance ShapeLifter O C where
+ singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n))
+ fwdEntryFact _ f = f
+ bwdEntryFact _ _ f = f
+ ftransfer (FwdTransfer3 (_, _, ft)) n f = ft n f
+ btransfer (BwdTransfer3 (_, _, bt)) n f = bt n f
+ frewrite (FwdRewrite3 (_, _, fr)) n f = fr n f
+ brewrite (BwdRewrite3 (_, _, br)) n f = br n f
+ fwdEntryLabel _ = NothingC
+
+{-
+class ShapeLifter e x where
+ singletonDG :: f -> n e x -> DG f n e x
+
+instance ShapeLifter C O where
+ singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil))
+
+instance ShapeLifter O O where
+ singletonDG f = gUnitOO . DBlock f . BMiddle
+
+instance ShapeLifter O C where
+ singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n))
+-}
+
+-- Fact lookup: the fact `orelse` bottom
+getFact :: DataflowLattice f -> Label -> FactBase f -> f
+getFact lat l fb = case lookupFact l fb of Just f -> f
+ Nothing -> fact_bot lat
+
+
+
+{- Note [Respects fuel]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-}
+-- $fuel
+-- A value of type 'FwdRewrite' or 'BwdRewrite' /respects fuel/ if
+-- any function contained within the value satisfies the following properties:
+--
+-- * When fuel is exhausted, it always returns 'Nothing'.
+--
+-- * When it returns @Just g rw@, it consumes /exactly/ one unit
+-- of fuel, and new rewrite 'rw' also respects fuel.
+--
+-- Provided that functions passed to 'mkFRewrite', 'mkFRewrite3',
+-- 'mkBRewrite', and 'mkBRewrite3' are not aware of the fuel supply,
+-- the results respect fuel.
+--
+-- It is an /unchecked/ run-time error for the argument passed to 'wrapFR',
+-- 'wrapFR2', 'wrapBR', or 'wrapBR2' to return a function that does not respect fuel.
+
+-- -----------------------------------------------------------------------------
+-- a Heap of Int
+
+-- We should really use a proper Heap here, but my attempts to make
+-- one have not succeeded in beating the simple ordered list. Another
+-- alternative is IntSet (using deleteFindMin), but that was also
+-- slower than the ordered list in my experiments --SDM 25/1/2012
+
+type IntHeap = [Int] -- ordered
+
+insertIntHeap :: Int -> [Int] -> [Int]
+insertIntHeap x [] = [x]
+insertIntHeap x (y:ys)
+ | x < y = x : y : ys
+ | x == y = x : ys
+ | otherwise = y : insertIntHeap x ys
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index 04586b1029..443fa3a441 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -1,29 +1,19 @@
{-# LANGUAGE GADTs #-}
--- ToDo: remove -fno-warn-warnings-deprecations
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
--- ToDo: remove -fno-warn-incomplete-patterns
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-
--- Module for building CmmAGraphs.
-
--- As the CmmAGraph is a wrapper over Graph CmmNode O x, it is different
--- from Hoopl's AGraph. The current clients expect functions with the
--- same names Hoopl uses, so this module cannot be in the same namespace
--- as Compiler.Hoopl.
-
module MkGraph
- ( CmmAGraph
- , emptyAGraph, (<*>), catAGraphs, outOfLine
- , mkLabel, mkMiddle, mkLast
- , withFreshLabel, withUnique, lgraphOfAGraph, labelAGraph
+ ( CmmAGraph, CgStmt(..)
+ , (<*>), catAGraphs
+ , mkLabel, mkMiddle, mkLast, outOfLine
+ , lgraphOfAGraph, labelAGraph
, stackStubExpr
- , mkNop, mkAssign, mkStore, mkCall, mkCmmCall, mkSafeCall, mkUnsafeCall, mkFinalCall
- , mkJump, mkDirectJump, mkForeignJump, mkJumpGC, mkCbranch, mkSwitch
- , mkReturn, mkReturnSimple, mkComment, mkCallEntry
- , mkBranch, mkCmmIfThenElse, mkCmmIfThen, mkCmmWhileDo
- , copyInOflow, copyInSlot, copyOutOflow, copyOutSlot
+ , mkNop, mkAssign, mkStore, mkUnsafeCall, mkFinalCall, mkCallReturnsTo
+ , mkJump, mkDirectJump, mkForeignJump, mkForeignJumpExtra, mkJumpGC
+ , mkCbranch, mkSwitch
+ , mkReturn, mkReturnSimple, mkComment, mkCallEntry, mkBranch
+ , copyInOflow, copyOutOflow
+ , noExtraStack
+ , toCall, Transfer(..)
)
where
@@ -31,250 +21,232 @@ import BlockId
import Cmm
import CmmCallConv (assignArgumentsPos, ParamLocation(..))
+
import Compiler.Hoopl hiding (Unique, (<*>), mkFirst, mkMiddle, mkLast, mkLabel, mkBranch, Shape(..))
-import qualified Compiler.Hoopl as H
-import Compiler.Hoopl.GHC (uniqueToLbl)
import FastString
import ForeignCall
import Outputable
import Prelude hiding (succ)
import SMRep (ByteOff)
-import StaticFlags
-import Unique
import UniqSupply
-import Util
+import OrdList
#include "HsVersions.h"
-{-
-A 'CmmAGraph' is an abstract version of a 'Graph CmmNode O x' from module
-'Cmm'. The difference is that the 'CmmAGraph' can be eigher open of closed at
-exit and it can supply fresh Labels and Uniques.
-
-It also supports a splicing operation <*>, which is different from the Hoopl's
-<*>, because it splices two CmmAGraphs. Specifically, it can splice Graph
-O C and Graph O x. In this case, the open beginning of the second graph is
-thrown away. In the debug mode this sequence is checked to be empty or
-containing a branch (see note [Branch follows branch]).
-
-When an CmmAGraph open at exit is being converted to a CmmGraph, the output
-exit sequence is considered unreachable. If the graph consist of one block
-only, if it not the case and we crash. Otherwise we just throw the exit
-sequence away (and in debug mode we test that it really was unreachable).
--}
-
-{-
-Node [Branch follows branch]
-============================
-Why do we say it's ok for a Branch to follow a Branch?
-Because the standard constructor mkLabel has fall-through
-semantics. So if you do a mkLabel, you finish the current block,
-giving it a label, and start a new one that branches to that label.
-Emitting a Branch at this point is fine:
- goto L1; L2: ...stuff...
--}
-
-data CmmGraphOC = Opened (Graph CmmNode O O)
- | Closed (Graph CmmNode O C)
-type CmmAGraph = UniqSM CmmGraphOC -- Graph open at entry
-
-{-
-MS: I began with
- newtype CmmAGraph = forall x. AG (UniqSM (Graph CmmNode O x))
-but that does not work well, because we cannot take the graph
-out of the monad -- we do not know the type of what we would take
-out and pattern matching does not help, as we cannot pattern match
-on a graph inside the monad.
--}
-data Transfer = Call | Jump | Ret deriving Eq
+-----------------------------------------------------------------------------
+-- Building Graphs
+
+
+-- | CmmAGraph is a chunk of code consisting of:
+--
+-- * ordinary statements (assignments, stores etc.)
+-- * jumps
+-- * labels
+-- * out-of-line labelled blocks
+--
+-- The semantics is that control falls through labels and out-of-line
+-- blocks. Everything after a jump up to the next label is by
+-- definition unreachable code, and will be discarded.
+--
+-- Two CmmAGraphs can be stuck together with <*>, with the meaning that
+-- control flows from the first to the second.
+--
+-- A 'CmmAGraph' can be turned into a 'CmmGraph' (closed at both ends)
+-- by providing a label for the entry point; see 'labelAGraph'.
+--
+type CmmAGraph = OrdList CgStmt
+
+data CgStmt
+ = CgLabel BlockId
+ | CgStmt (CmmNode O O)
+ | CgLast (CmmNode O C)
+ | CgFork BlockId CmmAGraph
+
+flattenCmmAGraph :: BlockId -> CmmAGraph -> CmmGraph
+flattenCmmAGraph id stmts =
+ CmmGraph { g_entry = id,
+ g_graph = GMany NothingO body NothingO }
+ where
+ (block, blocks) = flatten (fromOL stmts)
+ entry = blockJoinHead (CmmEntry id) block
+ body = foldr addBlock emptyBody (entry:blocks)
+
+ flatten :: [CgStmt] -> (Block CmmNode O C, [Block CmmNode C C])
+ flatten [] = panic "flatten []"
+
+ -- A label at the end of a function or fork: this label must not be reachable,
+ -- but it might be referred to from another BB that also isn't reachable.
+ -- Eliminating these has to be done with a dead-code analysis. For now,
+ -- we just make it into a well-formed block by adding a recursive jump.
+ flatten [CgLabel id]
+ = (goto_id, [blockJoinHead (CmmEntry id) goto_id] )
+ where goto_id = blockJoinTail emptyBlock (CmmBranch id)
+
+ -- A jump/branch: throw away all the code up to the next label, because
+ -- it is unreachable. Be careful to keep forks that we find on the way.
+ flatten (CgLast stmt : stmts)
+ = case dropWhile isOrdinaryStmt stmts of
+ [] ->
+ ( sing, [] )
+ [CgLabel id] ->
+ ( sing, [blockJoin (CmmEntry id) emptyBlock (CmmBranch id)] )
+ (CgLabel id : stmts) ->
+ ( sing, blockJoinHead (CmmEntry id) block : blocks )
+ where (block,blocks) = flatten stmts
+ (CgFork fork_id stmts : ss) ->
+ flatten (CgFork fork_id stmts : CgLast stmt : ss)
+ _ -> panic "MkGraph.flatten"
+ where
+ sing = blockJoinTail emptyBlock stmt
+
+ flatten (s:ss) =
+ case s of
+ CgStmt stmt -> (blockCons stmt block, blocks)
+ CgLabel id -> (blockJoinTail emptyBlock (CmmBranch id),
+ blockJoinHead (CmmEntry id) block : blocks)
+ CgFork fork_id stmts ->
+ (block, blockJoinHead (CmmEntry fork_id) fork_block : fork_blocks ++ blocks)
+ where (fork_block, fork_blocks) = flatten (fromOL stmts)
+ _ -> panic "MkGraph.flatten"
+ where (block,blocks) = flatten ss
+
+isOrdinaryStmt :: CgStmt -> Bool
+isOrdinaryStmt (CgStmt _) = True
+isOrdinaryStmt (CgLast _) = True
+isOrdinaryStmt _ = False
+
+
---------- AGraph manipulation
-emptyAGraph :: CmmAGraph
(<*>) :: CmmAGraph -> CmmAGraph -> CmmAGraph
+(<*>) = appOL
+
catAGraphs :: [CmmAGraph] -> CmmAGraph
+catAGraphs = concatOL
+
+-- | creates a sequence "goto id; id:" as an AGraph
+mkLabel :: BlockId -> CmmAGraph
+mkLabel bid = unitOL (CgLabel bid)
-mkLabel :: BlockId -> CmmAGraph -- created a sequence "goto id; id:" as an AGraph
-mkMiddle :: CmmNode O O -> CmmAGraph -- creates an open AGraph from a given node
-mkLast :: CmmNode O C -> CmmAGraph -- created a closed AGraph from a given node
+-- | creates an open AGraph from a given node
+mkMiddle :: CmmNode O O -> CmmAGraph
+mkMiddle middle = unitOL (CgStmt middle)
-withFreshLabel :: String -> (BlockId -> CmmAGraph) -> CmmAGraph
-withUnique :: (Unique -> CmmAGraph) -> CmmAGraph
+-- | creates a closed AGraph from a given node
+mkLast :: CmmNode O C -> CmmAGraph
+mkLast last = unitOL (CgLast last)
+-- | A labelled code block; should end in a last node
+outOfLine :: BlockId -> CmmAGraph -> CmmAGraph
+outOfLine l g = unitOL (CgFork l g)
+
+-- | allocate a fresh label for the entry point
lgraphOfAGraph :: CmmAGraph -> UniqSM CmmGraph
- -- ^ allocate a fresh label for the entry point
+lgraphOfAGraph g = do u <- getUniqueM
+ return (flattenCmmAGraph (mkBlockId u) g)
+
+-- | use the given BlockId as the label of the entry point
labelAGraph :: BlockId -> CmmAGraph -> UniqSM CmmGraph
- -- ^ use the given BlockId as the label of the entry point
+labelAGraph lbl ag = return (flattenCmmAGraph lbl ag)
---------- No-ops
mkNop :: CmmAGraph
+mkNop = nilOL
+
mkComment :: FastString -> CmmAGraph
+#ifdef DEBUG
+-- SDM: generating all those comments takes time, this saved about 4% for me
+mkComment fs = mkMiddle $ CmmComment fs
+#else
+mkComment _ = nilOL
+#endif
---------- Assignment and store
mkAssign :: CmmReg -> CmmExpr -> CmmAGraph
-mkStore :: CmmExpr -> CmmExpr -> CmmAGraph
+mkAssign l r = mkMiddle $ CmmAssign l r
----------- Calls
-mkCall :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmActual] ->
- UpdFrameOffset -> CmmAGraph
-mkCmmCall :: CmmExpr -> [CmmFormal] -> [CmmActual] ->
- UpdFrameOffset -> CmmAGraph
- -- Native C-- calling convention
-mkSafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset -> Bool -> CmmAGraph
-mkUnsafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph
-mkFinalCall :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
- -- Never returns; like exit() or barf()
+mkStore :: CmmExpr -> CmmExpr -> CmmAGraph
+mkStore l r = mkMiddle $ CmmStore l r
---------- Control transfer
-mkJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkDirectJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkJumpGC :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkForeignJump :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkCbranch :: CmmExpr -> BlockId -> BlockId -> CmmAGraph
-mkSwitch :: CmmExpr -> [Maybe BlockId] -> CmmAGraph
+mkJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkJump e actuals updfr_off =
+ lastWithArgs Jump Old NativeNodeCall actuals updfr_off $
+ toCall e Nothing updfr_off 0
+
+mkDirectJump :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkDirectJump e actuals updfr_off =
+ lastWithArgs Jump Old NativeDirectCall actuals updfr_off $
+ toCall e Nothing updfr_off 0
+
+mkJumpGC :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkJumpGC e actuals updfr_off =
+ lastWithArgs Jump Old GC actuals updfr_off $
+ toCall e Nothing updfr_off 0
+
+mkForeignJump :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset
+ -> CmmAGraph
+mkForeignJump conv e actuals updfr_off =
+ mkForeignJumpExtra conv e actuals updfr_off noExtraStack
+
+mkForeignJumpExtra :: Convention -> CmmExpr -> [CmmActual]
+ -> UpdFrameOffset -> (ByteOff, [(CmmExpr, ByteOff)])
+ -> CmmAGraph
+mkForeignJumpExtra conv e actuals updfr_off extra_stack =
+ lastWithArgsAndExtraStack Jump Old conv actuals updfr_off extra_stack $
+ toCall e Nothing updfr_off 0
+
+mkCbranch :: CmmExpr -> BlockId -> BlockId -> CmmAGraph
+mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot)
+
+mkSwitch :: CmmExpr -> [Maybe BlockId] -> CmmAGraph
+mkSwitch e tbl = mkLast $ CmmSwitch e tbl
+
mkReturn :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkReturn e actuals updfr_off =
+ lastWithArgs Ret Old NativeReturn actuals updfr_off $
+ toCall e Nothing updfr_off 0
+
mkReturnSimple :: [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkReturnSimple actuals updfr_off =
+ mkReturn e actuals updfr_off
+ where e = CmmLoad (CmmStackSlot Old updfr_off) gcWord
mkBranch :: BlockId -> CmmAGraph
-mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> CmmAGraph
-mkCmmIfThen :: CmmExpr -> CmmAGraph -> CmmAGraph
-mkCmmWhileDo :: CmmExpr -> CmmAGraph -> CmmAGraph
-
-outOfLine :: CmmAGraph -> CmmAGraph
--- ^ The argument is an CmmAGraph that must have an
--- empty entry sequence and be closed at the end.
--- The result is a new CmmAGraph that is open at the
--- end and goes directly from entry to exit, with the
--- original graph sitting to the side out-of-line.
---
--- Example: mkMiddle (x = 3)
--- <*> outOfLine (mkLabel L <*> ...stuff...)
--- <*> mkMiddle (y = x)
--- Control will flow directly from x=3 to y=x;
--- the block starting with L is "on the side".
---
--- N.B. algebraically forall g g' : g <*> outOfLine g' == outOfLine g' <*> g
+mkBranch bid = mkLast (CmmBranch bid)
+
+mkFinalCall :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset
+ -> CmmAGraph
+mkFinalCall f _ actuals updfr_off =
+ lastWithArgs Call Old NativeDirectCall actuals updfr_off $
+ toCall f Nothing updfr_off 0
+
+mkCallReturnsTo :: CmmExpr -> Convention -> [CmmActual]
+ -> BlockId
+ -> ByteOff
+ -> UpdFrameOffset
+ -> (ByteOff, [(CmmExpr,ByteOff)])
+ -> CmmAGraph
+mkCallReturnsTo f callConv actuals ret_lbl ret_off updfr_off extra_stack = do
+ lastWithArgsAndExtraStack Call (Young ret_lbl) callConv actuals
+ updfr_off extra_stack $
+ toCall f (Just ret_lbl) updfr_off ret_off
+
+mkUnsafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph
+mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as
+
--------------------------------------------------------------------------
--- ================ IMPLEMENTATION ================--
-
---------------------------------------------------
--- Raw CmmAGraph handling
-
-emptyAGraph = return $ Opened emptyGraph
-ag <*> ah = do g <- ag
- h <- ah
- return (case (g, h) of
- (Opened g, Opened h) -> Opened $ g H.<*> h
- (Opened g, Closed h) -> Closed $ g H.<*> h
- (Closed g, Opened GNil) -> Closed g
- (Closed g, Opened (GUnit e)) -> note_unreachable e $ Closed g
- (Closed g, Opened (GMany (JustO e) b x)) -> note_unreachable e $ Opened $ g H.|*><*| GMany NothingO b x
- (Closed g, Closed (GMany (JustO e) b x)) -> note_unreachable e $ Closed $ g H.|*><*| GMany NothingO b x
- :: CmmGraphOC)
-catAGraphs = foldl (<*>) emptyAGraph
-
-outOfLine ag = withFreshLabel "outOfLine" $ \l ->
- do g <- ag
- return (case g of
- Closed (GMany (JustO e) b _) -> note_unreachable e $ Opened $
- GMany (JustO $ BLast $ CmmBranch l) b (JustO $ BFirst $ CmmEntry l)
- _ -> panic "outOfLine"
- :: CmmGraphOC)
-
-note_unreachable :: Block CmmNode O x -> a -> a
-note_unreachable block graph =
- ASSERT (block_is_empty_or_label) -- Note [Branch follows branch]
- graph
- where block_is_empty_or_label :: Bool
- block_is_empty_or_label = case blockToNodeList block of
- (NothingC, [], NothingC) -> True
- (NothingC, [], JustC (CmmBranch _)) -> True
- _ -> False
-
-mkLabel bid = return $ Opened $ H.mkLast (CmmBranch bid) |*><*| H.mkFirst (CmmEntry bid)
-mkMiddle middle = return $ Opened $ H.mkMiddle middle
-mkLast last = return $ Closed $ H.mkLast last
-
-withUnique f = getUniqueM >>= f
-withFreshLabel _name f = getUniqueM >>= f . uniqueToLbl . intToUnique . getKey
-lgraphOfAGraph g = do u <- getUniqueM
- labelAGraph (mkBlockId u) g
-
-labelAGraph lbl ag = do g <- ag
- return $ CmmGraph {g_entry=lbl, g_graph=H.mkFirst (CmmEntry lbl) H.<*> closed g}
- where closed :: CmmGraphOC -> Graph CmmNode O C
- closed (Closed g) = g
- closed (Opened g@(GMany entry body (JustO exit))) =
- ASSERT (entryLabel exit `notElem` map entryLabel (postorder_dfs g))
- GMany entry body NothingO
- closed (Opened _) = panic "labelAGraph"
-
---------------------------------------------------
--- CmmAGraph constructions
-
-mkNop = emptyAGraph
-mkComment fs = mkMiddle $ CmmComment fs
-mkStore l r = mkMiddle $ CmmStore l r
-
--- NEED A COMPILER-DEBUGGING FLAG HERE
--- Sanity check: any value assigned to a pointer must be non-zero.
--- If it's 0, cause a crash immediately.
-mkAssign l r = if opt_StubDeadValues then assign l r <*> check l else assign l r
- where assign l r = mkMiddle (CmmAssign l r)
- check (CmmGlobal _) = mkNop
- check l@(CmmLocal reg) = -- if a ptr arg is NULL, cause a crash!
- if isGcPtrType ty then
- mkCmmIfThen (CmmMachOp (MO_Eq w) [r, stackStubExpr w])
- (assign l (CmmLoad (CmmLit (CmmInt 0 w)) ty))
- else mkNop
- where ty = localRegType reg
- w = typeWidth ty
- r = CmmReg l
-- Why are we inserting extra blocks that simply branch to the successors?
-- Because in addition to the branch instruction, @mkBranch@ will insert
-- a necessary adjustment to the stack pointer.
-mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot)
-mkSwitch e tbl = mkLast $ CmmSwitch e tbl
-
-mkSafeCall t fs as upd i = withFreshLabel "safe call" $ body
- where
- body k =
- ( mkStore (CmmStackSlot (CallArea (Young k)) (widthInBytes wordWidth))
- (CmmLit (CmmBlock k))
- <*> mkLast (CmmForeignCall {tgt=t, res=fs, args=as, succ=k, updfr=upd, intrbl=i})
- <*> mkLabel k)
-mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as
-mkBranch bid = mkLast (CmmBranch bid)
-
-mkCmmIfThenElse e tbranch fbranch =
- withFreshLabel "end of if" $ \endif ->
- withFreshLabel "start of then" $ \tid ->
- withFreshLabel "start of else" $ \fid ->
- mkCbranch e tid fid <*>
- mkLabel tid <*> tbranch <*> mkBranch endif <*>
- mkLabel fid <*> fbranch <*> mkLabel endif
-
-mkCmmIfThen e tbranch
- = withFreshLabel "end of if" $ \endif ->
- withFreshLabel "start of then" $ \tid ->
- mkCbranch e tid endif <*>
- mkLabel tid <*> tbranch <*> mkLabel endif
-
-mkCmmWhileDo e body =
- withFreshLabel "loop test" $ \test ->
- withFreshLabel "loop head" $ \head ->
- withFreshLabel "end while" $ \endwhile ->
- -- Forrest Baskett's while-loop layout
- mkBranch test <*> mkLabel head <*> body
- <*> mkLabel test <*> mkCbranch e head endwhile
- <*> mkLabel endwhile
-- For debugging purposes, we can stub out dead stack slots:
stackStubExpr :: Width -> CmmExpr
@@ -286,12 +258,9 @@ stackStubExpr w = CmmLit (CmmInt 0 w)
-- Therefore, for copying arguments and results, we provide different
-- functions to pass the arguments in an overflow area and to pass them in spill slots.
copyInOflow :: Convention -> Area -> [CmmFormal] -> (Int, CmmAGraph)
-copyInSlot :: Convention -> [CmmFormal] -> [CmmNode O O]
-copyOutSlot :: Convention -> [LocalReg] -> [CmmNode O O]
copyInOflow conv area formals = (offset, catAGraphs $ map mkMiddle nodes)
where (offset, nodes) = copyIn oneCopyOflowI conv area formals
-copyInSlot c f = snd $ copyIn oneCopySlotI c (panic "no area for copying to slots") f
type SlotCopier = Area -> (LocalReg, ByteOff) -> (ByteOff, [CmmNode O O]) ->
(ByteOff, [CmmNode O O])
@@ -312,104 +281,91 @@ copyIn oflow conv area formals =
adjust rst x@(_, RegisterParam _) = x : rst
-- Copy-in one arg, using overflow space if needed.
-oneCopyOflowI, oneCopySlotI :: SlotCopier
+oneCopyOflowI :: SlotCopier
oneCopyOflowI area (reg, off) (n, ms) =
(max n off, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty) : ms)
where ty = localRegType reg
--- Copy-in one arg, using spill slots if needed -- used for calling conventions at
--- a procpoint that is not a return point. The offset is irrelevant here...
-oneCopySlotI _ (reg, _) (n, ms) =
- (n, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg) w) ty) : ms)
- where ty = localRegType reg
- w = widthInBytes (typeWidth ty)
-
-
-- Factoring out the common parts of the copyout functions yielded something
-- more complicated:
-copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual] -> UpdFrameOffset ->
- (Int, CmmAGraph)
+data Transfer = Call | Jump | Ret deriving Eq
+
+copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual]
+ -> UpdFrameOffset
+ -> (ByteOff, [(CmmExpr,ByteOff)]) -- extra stack stuff
+ -> (Int, [GlobalReg], CmmAGraph)
+
-- Generate code to move the actual parameters into the locations
--- required by the calling convention. This includes a store for the return address.
+-- required by the calling convention. This includes a store for the
+-- return address.
--
--- The argument layout function ignores the pointer to the info table, so we slot that
--- in here. When copying-out to a young area, we set the info table for return
--- and adjust the offsets of the other parameters.
--- If this is a call instruction, we adjust the offsets of the other parameters.
-copyOutOflow conv transfer area@(CallArea a) actuals updfr_off
- = foldr co (init_offset, emptyAGraph) args'
+-- The argument layout function ignores the pointer to the info table,
+-- so we slot that in here. When copying-out to a young area, we set
+-- the info table for return and adjust the offsets of the other
+-- parameters. If this is a call instruction, we adjust the offsets
+-- of the other parameters.
+copyOutOflow conv transfer area actuals updfr_off
+ (extra_stack_off, extra_stack_stuff)
+ = foldr co (init_offset, [], mkNop) (args' ++ stack_params)
where
- co (v, RegisterParam r) (n, ms) = (n, mkAssign (CmmGlobal r) v <*> ms)
- co (v, StackParam off) (n, ms) = (max n off, mkStore (CmmStackSlot area off) v <*> ms)
+ co (v, RegisterParam r) (n, rs, ms)
+ = (n, r:rs, mkAssign (CmmGlobal r) v <*> ms)
+ co (v, StackParam off) (n, rs, ms)
+ = (max n off, rs, mkStore (CmmStackSlot area off) v <*> ms)
+
+ stack_params = [ (e, StackParam (off + init_offset))
+ | (e,off) <- extra_stack_stuff ]
(setRA, init_offset) =
- case a of Young id -> id `seq` -- Generate a store instruction for
- -- the return address if making a call
+ case area of
+ Young id -> id `seq` -- Generate a store instruction for
+ -- the return address if making a call
if transfer == Call then
([(CmmLit (CmmBlock id), StackParam init_offset)],
widthInBytes wordWidth)
else ([], 0)
- Old -> ([], updfr_off)
+ Old -> ([], updfr_off)
+
+ arg_offset = init_offset + extra_stack_off
args :: [(CmmExpr, ParamLocation)] -- The argument and where to put it
args = assignArgumentsPos conv cmmExprType actuals
args' = foldl adjust setRA args
- where adjust rst (v, StackParam off) = (v, StackParam (off + init_offset)) : rst
+ where adjust rst (v, StackParam off) = (v, StackParam (off + arg_offset)) : rst
adjust rst x@(_, RegisterParam _) = x : rst
-copyOutOflow _ _ (RegSlot _) _ _ = panic "cannot copy arguments into a register slot"
--- Args passed only in registers and stack slots; no overflow space.
--- No return address may apply!
-copyOutSlot conv actuals = foldr co [] args
- where co (v, RegisterParam r) ms = CmmAssign (CmmGlobal r) (toExp v) : ms
- co (v, StackParam off) ms = CmmStore (CmmStackSlot (RegSlot v) off) (toExp v) : ms
- toExp r = CmmReg (CmmLocal r)
- args = assignArgumentsPos conv localRegType actuals
mkCallEntry :: Convention -> [CmmFormal] -> (Int, CmmAGraph)
-mkCallEntry conv formals = copyInOflow conv (CallArea Old) formals
+mkCallEntry conv formals = copyInOflow conv Old formals
-lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual] -> UpdFrameOffset ->
- (ByteOff -> CmmAGraph) -> CmmAGraph
+lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual]
+ -> UpdFrameOffset
+ -> (ByteOff -> [GlobalReg] -> CmmAGraph)
+ -> CmmAGraph
lastWithArgs transfer area conv actuals updfr_off last =
- let (outArgs, copies) = copyOutOflow conv transfer area actuals updfr_off in
- copies <*> last outArgs
-
--- The area created for the jump and return arguments is the same area as the
--- procedure entry.
-old :: Area
-old = CallArea Old
-toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff -> ByteOff -> CmmAGraph
-toCall e cont updfr_off res_space arg_space =
- mkLast $ CmmCall e cont arg_space res_space updfr_off
-mkJump e actuals updfr_off =
- lastWithArgs Jump old NativeNodeCall actuals updfr_off $ toCall e Nothing updfr_off 0
-mkDirectJump e actuals updfr_off =
- lastWithArgs Jump old NativeDirectCall actuals updfr_off $ toCall e Nothing updfr_off 0
-mkJumpGC e actuals updfr_off =
- lastWithArgs Jump old GC actuals updfr_off $ toCall e Nothing updfr_off 0
-mkForeignJump conv e actuals updfr_off =
- lastWithArgs Jump old conv actuals updfr_off $ toCall e Nothing updfr_off 0
-mkReturn e actuals updfr_off =
- lastWithArgs Ret old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0
- -- where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord
-mkReturnSimple actuals updfr_off =
- lastWithArgs Ret old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0
- where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord
-
-mkFinalCall f _ actuals updfr_off =
- lastWithArgs Call old NativeDirectCall actuals updfr_off $ toCall f Nothing updfr_off 0
-
-mkCmmCall f results actuals = mkCall f (NativeDirectCall, NativeReturn) results actuals
-
--- I'm dropping the SRT, but that should be okay: we plan to reconstruct it later.
-mkCall f (callConv, retConv) results actuals updfr_off =
- withFreshLabel "call successor" $ \k ->
- let area = CallArea $ Young k
- (off, copyin) = copyInOflow retConv area results
- copyout = lastWithArgs Call area callConv actuals updfr_off
- (toCall f (Just k) updfr_off off)
- in (copyout <*> mkLabel k <*> copyin)
+ lastWithArgsAndExtraStack transfer area conv actuals
+ updfr_off noExtraStack last
+
+lastWithArgsAndExtraStack :: Transfer -> Area -> Convention -> [CmmActual]
+ -> UpdFrameOffset -> (ByteOff, [(CmmExpr,ByteOff)])
+ -> (ByteOff -> [GlobalReg] -> CmmAGraph)
+ -> CmmAGraph
+lastWithArgsAndExtraStack transfer area conv actuals updfr_off
+ extra_stack last =
+ copies <*> last outArgs regs
+ where
+ (outArgs, regs, copies) = copyOutOflow conv transfer area actuals
+ updfr_off extra_stack
+
+
+noExtraStack :: (ByteOff, [(CmmExpr,ByteOff)])
+noExtraStack = (0,[])
+
+toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff
+ -> ByteOff -> [GlobalReg]
+ -> CmmAGraph
+toCall e cont updfr_off res_space arg_space regs =
+ mkLast $ CmmCall e cont regs arg_space res_space updfr_off
diff --git a/compiler/cmm/OldCmm.hs b/compiler/cmm/OldCmm.hs
index fc4706c8c4..aa83afbf8d 100644
--- a/compiler/cmm/OldCmm.hs
+++ b/compiler/cmm/OldCmm.hs
@@ -9,9 +9,7 @@
module OldCmm (
CmmGroup, GenCmmGroup, RawCmmGroup, CmmDecl, RawCmmDecl,
ListGraph(..),
-
- CmmInfo(..), CmmInfoTable(..), ClosureTypeInfo(..), UpdateFrame(..),
-
+ CmmInfoTable(..), ClosureTypeInfo(..),
CmmStatic(..), CmmStatics(..), CmmFormal, CmmActual,
cmmMapGraph, cmmTopMapGraph,
@@ -49,24 +47,6 @@ import ForeignCall
-- with assembly-language labels.
-----------------------------------------------------------------------------
--- Info Tables
------------------------------------------------------------------------------
-
-data CmmInfo
- = CmmInfo
- (Maybe BlockId) -- GC target. Nothing <=> CPS won't do stack check
- -- JD: NOT USED BY NEW CODE GEN
- (Maybe UpdateFrame) -- Update frame
- CmmInfoTable -- Info table
-
--- | A frame that is to be pushed before entry to the function.
--- Used to handle 'update' frames.
-data UpdateFrame
- = UpdateFrame
- CmmExpr -- Frame header. Behaves like the target of a 'jump'.
- [CmmExpr] -- Frame remainder. Behaves like the arguments of a 'jump'.
-
------------------------------------------------------------------------------
-- Cmm, CmmDecl, CmmBasicBlock
-----------------------------------------------------------------------------
@@ -85,8 +65,8 @@ data UpdateFrame
newtype ListGraph i = ListGraph [GenBasicBlock i]
-- | Cmm with the info table as a data type
-type CmmGroup = GenCmmGroup CmmStatics CmmInfo (ListGraph CmmStmt)
-type CmmDecl = GenCmmDecl CmmStatics CmmInfo (ListGraph CmmStmt)
+type CmmGroup = GenCmmGroup CmmStatics CmmInfoTable (ListGraph CmmStmt)
+type CmmDecl = GenCmmDecl CmmStatics CmmInfoTable (ListGraph CmmStmt)
-- | Cmm with the info tables converted to a list of 'CmmStatic' along with the info
-- table label. If we are building without tables-next-to-code there will be no statics
@@ -225,16 +205,9 @@ instance UserOfLocalRegs CmmCallTarget where
foldRegsUsed f set (CmmCallee e _) = foldRegsUsed f set e
foldRegsUsed f set (CmmPrim _ mStmts) = foldRegsUsed f set mStmts
-instance UserOfSlots CmmCallTarget where
- foldSlotsUsed f set (CmmCallee e _) = foldSlotsUsed f set e
- foldSlotsUsed _ set (CmmPrim {}) = set
-
instance UserOfLocalRegs a => UserOfLocalRegs (CmmHinted a) where
foldRegsUsed f set a = foldRegsUsed f set (hintlessCmm a)
-instance UserOfSlots a => UserOfSlots (CmmHinted a) where
- foldSlotsUsed f set a = foldSlotsUsed f set (hintlessCmm a)
-
instance DefinerOfLocalRegs a => DefinerOfLocalRegs (CmmHinted a) where
foldRegsDefd f set a = foldRegsDefd f set (hintlessCmm a)
diff --git a/compiler/cmm/OldCmmLint.hs b/compiler/cmm/OldCmmLint.hs
new file mode 100644
index 0000000000..72e40ce4f8
--- /dev/null
+++ b/compiler/cmm/OldCmmLint.hs
@@ -0,0 +1,209 @@
+-----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 2004-2006
+--
+-- CmmLint: checking the correctness of Cmm statements and expressions
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS -fno-warn-tabs #-}
+-- The above warning suppression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and
+-- detab the module (please do the detabbing in a separate patch). See
+-- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
+-- for details
+
+module OldCmmLint (
+ cmmLint, cmmLintTop
+ ) where
+
+import BlockId
+import OldCmm
+import CLabel
+import Outputable
+import OldPprCmm()
+import Constants
+import FastString
+import Platform
+
+import Data.Maybe
+
+-- -----------------------------------------------------------------------------
+-- Exported entry points:
+
+cmmLint :: (Outputable d, Outputable h)
+ => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc
+cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops
+
+cmmLintTop :: (Outputable d, Outputable h)
+ => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc
+cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top
+
+runCmmLint :: Outputable a
+ => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc
+runCmmLint _ l p =
+ case unCL (l p) of
+ Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
+ nest 2 err,
+ ptext $ sLit ("Program was:"),
+ nest 2 (ppr p)])
+ Right _ -> Nothing
+
+lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint ()
+lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks))
+ = addLintInfo (text "in proc " <> pprCLabel platform lbl) $
+ let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks
+ in mapM_ (lintCmmBlock platform labels) blocks
+
+lintCmmDecl _ (CmmData {})
+ = return ()
+
+lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint ()
+lintCmmBlock platform labels (BasicBlock id stmts)
+ = addLintInfo (text "in basic block " <> ppr id) $
+ mapM_ (lintCmmStmt platform labels) stmts
+
+-- -----------------------------------------------------------------------------
+-- lintCmmExpr
+
+-- Checks whether a CmmExpr is "type-correct", and check for obvious-looking
+-- byte/word mismatches.
+
+lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType
+lintCmmExpr platform (CmmLoad expr rep) = do
+ _ <- lintCmmExpr platform expr
+ -- Disabled, if we have the inlining phase before the lint phase,
+ -- we can have funny offsets due to pointer tagging. -- EZY
+ -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
+ -- cmmCheckWordAddress expr
+ return rep
+lintCmmExpr platform expr@(CmmMachOp op args) = do
+ tys <- mapM (lintCmmExpr platform) args
+ if map (typeWidth . cmmExprType) args == machOpArgReps op
+ then cmmCheckMachOp op args tys
+ else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
+lintCmmExpr platform (CmmRegOff reg offset)
+ = lintCmmExpr platform (CmmMachOp (MO_Add rep)
+ [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
+ where rep = typeWidth (cmmRegType reg)
+lintCmmExpr _ expr =
+ return (cmmExprType expr)
+
+-- Check for some common byte/word mismatches (eg. Sp + 1)
+cmmCheckMachOp :: MachOp -> [CmmExpr] -> [CmmType] -> CmmLint CmmType
+cmmCheckMachOp op [lit@(CmmLit (CmmInt { })), reg@(CmmReg _)] tys
+ = cmmCheckMachOp op [reg, lit] tys
+cmmCheckMachOp op _ tys
+ = return (machOpResultType op tys)
+
+isOffsetOp :: MachOp -> Bool
+isOffsetOp (MO_Add _) = True
+isOffsetOp (MO_Sub _) = True
+isOffsetOp _ = False
+
+-- This expression should be an address from which a word can be loaded:
+-- check for funny-looking sub-word offsets.
+_cmmCheckWordAddress :: CmmExpr -> CmmLint ()
+_cmmCheckWordAddress e@(CmmMachOp op [arg, CmmLit (CmmInt i _)])
+ | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
+ = cmmLintDubiousWordOffset e
+_cmmCheckWordAddress e@(CmmMachOp op [CmmLit (CmmInt i _), arg])
+ | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
+ = cmmLintDubiousWordOffset e
+_cmmCheckWordAddress _
+ = return ()
+
+-- No warnings for unaligned arithmetic with the node register,
+-- which is used to extract fields from tagged constructor closures.
+notNodeReg :: CmmExpr -> Bool
+notNodeReg (CmmReg reg) | reg == nodeReg = False
+notNodeReg _ = True
+
+lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint ()
+lintCmmStmt platform labels = lint
+ where lint (CmmNop) = return ()
+ lint (CmmComment {}) = return ()
+ lint stmt@(CmmAssign reg expr) = do
+ erep <- lintCmmExpr platform expr
+ let reg_ty = cmmRegType reg
+ if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
+ then return ()
+ else cmmLintAssignErr stmt erep reg_ty
+ lint (CmmStore l r) = do
+ _ <- lintCmmExpr platform l
+ _ <- lintCmmExpr platform r
+ return ()
+ lint (CmmCall target _res args _) =
+ do lintTarget platform labels target
+ mapM_ (lintCmmExpr platform . hintlessCmm) args
+ lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e
+ lint (CmmSwitch e branches) = do
+ mapM_ checkTarget $ catMaybes branches
+ erep <- lintCmmExpr platform e
+ if (erep `cmmEqType_ignoring_ptrhood` bWord)
+ then return ()
+ else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <>
+ text " :: " <> ppr erep)
+ lint (CmmJump e _) = lintCmmExpr platform e >> return ()
+ lint (CmmReturn) = return ()
+ lint (CmmBranch id) = checkTarget id
+ checkTarget id = if setMember id labels then return ()
+ else cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
+
+lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint ()
+lintTarget platform _ (CmmCallee e _) = do _ <- lintCmmExpr platform e
+ return ()
+lintTarget _ _ (CmmPrim _ Nothing) = return ()
+lintTarget platform labels (CmmPrim _ (Just stmts))
+ = mapM_ (lintCmmStmt platform labels) stmts
+
+
+checkCond :: CmmExpr -> CmmLint ()
+checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return ()
+checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values
+checkCond expr
+ = cmmLintErr (hang (text "expression is not a conditional:") 2
+ (ppr expr))
+
+-- -----------------------------------------------------------------------------
+-- CmmLint monad
+
+-- just a basic error monad:
+
+newtype CmmLint a = CmmLint { unCL :: Either SDoc a }
+
+instance Monad CmmLint where
+ CmmLint m >>= k = CmmLint $ case m of
+ Left e -> Left e
+ Right a -> unCL (k a)
+ return a = CmmLint (Right a)
+
+cmmLintErr :: SDoc -> CmmLint a
+cmmLintErr msg = CmmLint (Left msg)
+
+addLintInfo :: SDoc -> CmmLint a -> CmmLint a
+addLintInfo info thing = CmmLint $
+ case unCL thing of
+ Left err -> Left (hang info 2 err)
+ Right a -> Right a
+
+cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a
+cmmLintMachOpErr expr argsRep opExpectsRep
+ = cmmLintErr (text "in MachOp application: " $$
+ nest 2 (ppr expr) $$
+ (text "op is expecting: " <+> ppr opExpectsRep) $$
+ (text "arguments provide: " <+> ppr argsRep))
+
+cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a
+cmmLintAssignErr stmt e_ty r_ty
+ = cmmLintErr (text "in assignment: " $$
+ nest 2 (vcat [ppr stmt,
+ text "Reg ty:" <+> ppr r_ty,
+ text "Rhs ty:" <+> ppr e_ty]))
+
+
+
+cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a
+cmmLintDubiousWordOffset expr
+ = cmmLintErr (text "offset is not a multiple of words: " $$
+ nest 2 (ppr expr))
diff --git a/compiler/cmm/OldPprCmm.hs b/compiler/cmm/OldPprCmm.hs
index d6a12221fb..9990fd26a4 100644
--- a/compiler/cmm/OldPprCmm.hs
+++ b/compiler/cmm/OldPprCmm.hs
@@ -63,10 +63,6 @@ instance Outputable instr => Outputable (GenBasicBlock instr) where
instance Outputable CmmStmt where
ppr s = pprStmt s
-instance Outputable CmmInfo where
- ppr i = pprInfo i
-
-
-- --------------------------------------------------------------------------
instance Outputable CmmSafety where
ppr CmmUnsafe = ptext (sLit "_unsafe_call_")
@@ -74,21 +70,6 @@ instance Outputable CmmSafety where
ppr (CmmSafe srt) = ppr srt
-- --------------------------------------------------------------------------
--- Info tables. The current pretty printer needs refinement
--- but will work for now.
---
--- For ideas on how to refine it, they used to be printed in the
--- style of C--'s 'stackdata' declaration, just inside the proc body,
--- and were labelled with the procedure name ++ "_info".
-pprInfo :: CmmInfo -> SDoc
-pprInfo (CmmInfo _gc_target update_frame info_table) =
- vcat [{-ptext (sLit "gc_target: ") <>
- maybe (ptext (sLit "<none>")) ppr gc_target,-}
- ptext (sLit "update_frame: ") <>
- maybe (ptext (sLit "<none>")) pprUpdateFrame update_frame,
- ppr info_table]
-
--- --------------------------------------------------------------------------
-- Basic blocks look like assembly blocks.
-- lbl: stmt ; stmt ; ..
pprBBlock :: Outputable stmt => GenBasicBlock stmt -> SDoc
@@ -156,18 +137,6 @@ pprStmt stmt = case stmt of
instance (Outputable a) => Outputable (CmmHinted a) where
ppr (CmmHinted a k) = ppr (a, k)
-pprUpdateFrame :: UpdateFrame -> SDoc
-pprUpdateFrame (UpdateFrame expr args) =
- hcat [ ptext (sLit "jump")
- , space
- , if isTrivialCmmExpr expr
- then pprExpr expr
- else case expr of
- CmmLoad (CmmReg _) _ -> pprExpr expr
- _ -> parens (pprExpr expr)
- , space
- , parens ( commafy $ map ppr args ) ]
-
-- --------------------------------------------------------------------------
-- goto local label. [1], section 6.6
--
diff --git a/compiler/cmm/OptimizationFuel.hs b/compiler/cmm/OptimizationFuel.hs
deleted file mode 100644
index a85b11bcc6..0000000000
--- a/compiler/cmm/OptimizationFuel.hs
+++ /dev/null
@@ -1,142 +0,0 @@
-{-# LANGUAGE TypeFamilies #-}
--- | Optimisation fuel is used to control the amount of work the optimiser does.
---
--- Every optimisation step consumes a certain amount of fuel and stops when
--- it runs out of fuel. This can be used e.g. to debug optimiser bugs: Run
--- the optimiser with varying amount of fuel to find out the exact number of
--- steps where a bug is introduced in the output.
-module OptimizationFuel
- ( OptimizationFuel, amountOfFuel, tankFilledTo, unlimitedFuel, anyFuelLeft, oneLessFuel
- , OptFuelState, initOptFuelState
- , FuelConsumer, FuelUsingMonad, FuelState
- , fuelGet, fuelSet, lastFuelPass, setFuelPass
- , fuelExhausted, fuelDec1, tryWithFuel
- , runFuelIO, runInfiniteFuelIO, fuelConsumingPass
- , FuelUniqSM
- , liftUniq
- )
-where
-
-import Data.IORef
-import Control.Monad
-import StaticFlags (opt_Fuel)
-import UniqSupply
-import Panic
-import Util
-
-import Compiler.Hoopl
-import Compiler.Hoopl.GHC (getFuel, setFuel)
-
-#include "HsVersions.h"
-
-
--- We limit the number of transactions executed using a record of flags
--- stored in an HscEnv. The flags store the name of the last optimization
--- pass and the amount of optimization fuel remaining.
-data OptFuelState =
- OptFuelState { pass_ref :: IORef String
- , fuel_ref :: IORef OptimizationFuel
- }
-initOptFuelState :: IO OptFuelState
-initOptFuelState =
- do pass_ref' <- newIORef "unoptimized program"
- fuel_ref' <- newIORef (tankFilledTo opt_Fuel)
- return OptFuelState {pass_ref = pass_ref', fuel_ref = fuel_ref'}
-
-type FuelConsumer a = OptimizationFuel -> (a, OptimizationFuel)
-
-tankFilledTo :: Int -> OptimizationFuel
-amountOfFuel :: OptimizationFuel -> Int
-
-anyFuelLeft :: OptimizationFuel -> Bool
-oneLessFuel :: OptimizationFuel -> OptimizationFuel
-unlimitedFuel :: OptimizationFuel
-
-newtype OptimizationFuel = OptimizationFuel Int
- deriving Show
-
-tankFilledTo = OptimizationFuel
-amountOfFuel (OptimizationFuel f) = f
-
-anyFuelLeft (OptimizationFuel f) = f > 0
-oneLessFuel (OptimizationFuel f) = ASSERT (f > 0) (OptimizationFuel (f - 1))
-unlimitedFuel = OptimizationFuel infiniteFuel
-
-data FuelState = FuelState { fs_fuel :: OptimizationFuel, fs_lastpass :: String }
-newtype FuelUniqSM a = FUSM { unFUSM :: FuelState -> UniqSM (a, FuelState) }
-
-fuelConsumingPass :: String -> FuelConsumer a -> FuelUniqSM a
-fuelConsumingPass name f = do setFuelPass name
- fuel <- fuelGet
- let (a, fuel') = f fuel
- fuelSet fuel'
- return a
-
-runFuelIO :: OptFuelState -> FuelUniqSM a -> IO a
-runFuelIO fs (FUSM f) =
- do pass <- readIORef (pass_ref fs)
- fuel <- readIORef (fuel_ref fs)
- u <- mkSplitUniqSupply 'u'
- let (a, FuelState fuel' pass') = initUs_ u $ f (FuelState fuel pass)
- writeIORef (pass_ref fs) pass'
- writeIORef (fuel_ref fs) fuel'
- return a
-
--- ToDo: Do we need the pass_ref when we are doing infinite fueld
--- transformations?
-runInfiniteFuelIO :: OptFuelState -> FuelUniqSM a -> IO a
-runInfiniteFuelIO fs (FUSM f) =
- do pass <- readIORef (pass_ref fs)
- u <- mkSplitUniqSupply 'u'
- let (a, FuelState _ pass') = initUs_ u $ f (FuelState unlimitedFuel pass)
- writeIORef (pass_ref fs) pass'
- return a
-
-instance Monad FuelUniqSM where
- FUSM f >>= k = FUSM (\s -> f s >>= \(a, s') -> unFUSM (k a) s')
- return a = FUSM (\s -> return (a, s))
-
-instance MonadUnique FuelUniqSM where
- getUniqueSupplyM = liftUniq getUniqueSupplyM
- getUniqueM = liftUniq getUniqueM
- getUniquesM = liftUniq getUniquesM
-
-liftUniq :: UniqSM x -> FuelUniqSM x
-liftUniq x = FUSM (\s -> x >>= (\u -> return (u, s)))
-
-class Monad m => FuelUsingMonad m where
- fuelGet :: m OptimizationFuel
- fuelSet :: OptimizationFuel -> m ()
- lastFuelPass :: m String
- setFuelPass :: String -> m ()
-
-fuelExhausted :: FuelUsingMonad m => m Bool
-fuelExhausted = fuelGet >>= return . anyFuelLeft
-
-fuelDec1 :: FuelUsingMonad m => m ()
-fuelDec1 = fuelGet >>= fuelSet . oneLessFuel
-
-tryWithFuel :: FuelUsingMonad m => a -> m (Maybe a)
-tryWithFuel r = do f <- fuelGet
- if anyFuelLeft f then fuelSet (oneLessFuel f) >> return (Just r)
- else return Nothing
-
-instance FuelUsingMonad FuelUniqSM where
- fuelGet = extract fs_fuel
- lastFuelPass = extract fs_lastpass
- fuelSet fuel = FUSM (\s -> return ((), s { fs_fuel = fuel }))
- setFuelPass pass = FUSM (\s -> return ((), s { fs_lastpass = pass }))
-
-extract :: (FuelState -> a) -> FuelUniqSM a
-extract f = FUSM (\s -> return (f s, s))
-
-instance FuelMonad FuelUniqSM where
- getFuel = liftM amountOfFuel fuelGet
- setFuel = fuelSet . tankFilledTo
-
--- Don't bother to checkpoint the unique supply; it doesn't matter
-instance CheckpointMonad FuelUniqSM where
- type Checkpoint FuelUniqSM = FuelState
- checkpoint = FUSM $ \fuel -> return (fuel, fuel)
- restart fuel = FUSM $ \_ -> return ((), fuel)
-
diff --git a/compiler/cmm/PprCmm.hs b/compiler/cmm/PprCmm.hs
index 183708c08e..9717eea179 100644
--- a/compiler/cmm/PprCmm.hs
+++ b/compiler/cmm/PprCmm.hs
@@ -146,8 +146,6 @@ pprConvention Slow = text "<slow-convention>"
pprConvention GC = text "<gc-convention>"
pprConvention PrimOpCall = text "<primop-call-convention>"
pprConvention PrimOpReturn = text "<primop-ret-convention>"
-pprConvention (Foreign c) = ppr c
-pprConvention (Private {}) = text "<private-convention>"
pprForeignConvention :: ForeignConvention -> SDoc
pprForeignConvention (ForeignConvention c as rs) = ppr c <> ppr as <> ppr rs
@@ -229,9 +227,9 @@ pprNode node = pp_node <+> pp_debug
, ptext (sLit ": goto")
, ppr (head [ id | Just id <- ids]) <> semi ]
- CmmCall tgt k out res updfr_off ->
+ CmmCall tgt k regs out res updfr_off ->
hcat [ ptext (sLit "call"), space
- , pprFun tgt, ptext (sLit "(...)"), space
+ , pprFun tgt, parens (interpp'SP regs), space
, ptext (sLit "returns to") <+> ppr k <+> parens (ppr out)
<+> parens (ppr res)
, ptext (sLit " with update frame") <+> ppr updfr_off
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index 7503127555..119f2b7239 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -237,12 +237,8 @@ pprLocalReg (LocalReg uniq rep)
-- Stack areas
pprArea :: Area -> SDoc
-pprArea (RegSlot r) = hcat [ text "slot<", ppr r, text ">" ]
-pprArea (CallArea id) = pprAreaId id
-
-pprAreaId :: AreaId -> SDoc
-pprAreaId Old = text "old"
-pprAreaId (Young id) = hcat [ text "young<", ppr id, text ">" ]
+pprArea Old = text "old"
+pprArea (Young id) = hcat [ text "young<", ppr id, text ">" ]
-- needs to be kept in syn with CmmExpr.hs.GlobalReg
--
diff --git a/compiler/cmm/SMRep.lhs b/compiler/cmm/SMRep.lhs
index ce30799bf6..8b3308ef97 100644
--- a/compiler/cmm/SMRep.lhs
+++ b/compiler/cmm/SMRep.lhs
@@ -21,6 +21,7 @@ module SMRep (
StgWord, StgHalfWord,
hALF_WORD_SIZE, hALF_WORD_SIZE_IN_BITS,
WordOff, ByteOff,
+ roundUpToWords,
-- * Closure repesentation
SMRep(..), -- CmmInfo sees the rep; no one else does
@@ -57,6 +58,7 @@ import FastString
import Data.Char( ord )
import Data.Word
+import Data.Bits
\end{code}
@@ -69,6 +71,9 @@ import Data.Word
\begin{code}
type WordOff = Int -- Word offset, or word count
type ByteOff = Int -- Byte offset, or byte count
+
+roundUpToWords :: ByteOff -> ByteOff
+roundUpToWords n = (n + (wORD_SIZE - 1)) .&. (complement (wORD_SIZE - 1))
\end{code}
StgWord is a type representing an StgWord on the target platform.
@@ -93,6 +98,7 @@ hALF_WORD_SIZE_IN_BITS = 32
#endif
\end{code}
+
%************************************************************************
%* *
\subsubsection[SMRep-datatype]{@SMRep@---storage manager representation}
diff --git a/compiler/cmm/cmm-notes b/compiler/cmm/cmm-notes
index 81882c8c0e..7f7107a18d 100644
--- a/compiler/cmm/cmm-notes
+++ b/compiler/cmm/cmm-notes
@@ -24,32 +24,6 @@ More notes (June 11)
* Check in ClosureInfo:
-- NB: Results here should line up with the results of SMRep.rtsClosureType
-* Possible refactoring: Nuke AGraph in favour of
- mkIfThenElse :: Expr -> Graph -> Graph -> FCode Graph
- or even
- mkIfThenElse :: HasUniques m => Expr -> Graph -> Graph -> m Graph
- (Remmber that the .cmm file parser must use this function)
-
- or parameterise FCode over its envt; the CgState part seem useful for both
-
-* "Remove redundant reloads" in CmmSpillReload should be redundant; since
- insertLateReloads is now gone, every reload is reloading a live variable.
- Test and nuke.
-
-* Stack layout is very like register assignment: find non-conflicting assigments.
- In particular we can use colouring or linear scan (etc).
-
- We'd fine-grain interference (on a word by word basis) to get maximum overlap.
- But that may make very big interference graphs. So linear scan might be
- more attactive.
-
- NB: linear scan does on-the-fly live range splitting.
-
-* When stubbing dead slots be careful not to write into an area that
- overlaps with an area that's in use. So stubbing needs to *follow*
- stack layout.
-
-
More notes (May 11)
~~~~~~~~~~~~~~~~~~~
In CmmNode, consider spliting CmmCall into two: call and jump
@@ -58,81 +32,16 @@ Notes on new codegen (Aug 10)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Things to do:
- - We insert spills for variables before the stack check! This is the reason for
- some fishy code in StgCmmHeap.entryHeapCheck where we are doing some strange
- things to fix up the stack pointer before GC calls/jumps.
-
- The reason spills are inserted before the sp check is that at the entry to a
- function we always store the parameters passed in registers to local variables.
- The spill pass simply inserts spills at variable definitions. We instead should
- sink the spills so that we can avoid spilling them on branches that never
- reload them.
-
- This will fix the spill before stack check problem but only really as a side
- effect. A 'real fix' probably requires making the spiller know about sp checks.
-
- EZY: I don't understand this comment. David Terei, can you clarify?
-
- Proc points pass all arguments on the stack, adding more code and
slowing down things a lot. We either need to fix this or even better
would be to get rid of proc points.
- - CmmInfo.cmmToRawCmm uses Old.Cmm, so it is called after converting Cmm.Cmm to
- Old.Cmm. We should abstract it to work on both representations, it needs only to
- convert a CmmInfoTable to [CmmStatic].
-
- - The MkGraph currenty uses a different semantics for <*> than Hoopl. Maybe
- we could convert codeGen/StgCmm* clients to the Hoopl's semantics?
- It's all deeply unsatisfactory.
-
- - Improve performance of Hoopl.
-
- A nofib comparison of -fasm vs -fnewcodegen nofib compilation parameters
- (using the same ghc-cmm branch +libraries compiled by the old codegenerator)
- is at http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.oldghchoopl.txt
- - the code produced is 10.9% slower, the compilation is +118% slower!
-
- The same comparison with ghc-head with zip representation is at
- http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.oldghczip.txt
- - the code produced is 11.7% slower, the compilation is +78% slower.
-
- When compiling nofib, ghc-cmm + libraries compiled with -fnew-codegen
- is 23.7% slower (http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.hooplghcoldgen.txt).
- When compiling nofib, ghc-head + libraries compiled with -fnew-codegen
- is 31.4% slower (http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.zipghcoldgen.txt).
-
- So we generate a bit better code, but it takes us longer!
-
- EZY: Also importantly, Hoopl uses dramatically more memory than the
- old code generator.
-
- - Are all blockToNodeList and blockOfNodeList really needed? Maybe we could
- splice blocks instead?
-
- In the CmmContFlowOpt.blockConcat, using Dataflow seems too clumsy. Still,
- a block catenation function would be probably nicer than blockToNodeList
- / blockOfNodeList combo.
-
- - lowerSafeForeignCall seems too lowlevel. Just use Dataflow. After that
- delete splitEntrySeq from HooplUtils.
-
- - manifestSP seems to touch a lot of the graph representation. It is
- also slow for CmmSwitch nodes O(block_nodes * switch_statements).
- Maybe rewrite manifestSP to use Dataflow?
-
- Sort out Label, LabelMap, LabelSet versus BlockId, BlockEnv, BlockSet
dichotomy. Mostly this means global replace, but we also need to make
Label an instance of Outputable (probably in the Outputable module).
EZY: We should use Label, since that's the terminology Hoopl uses.
- - NB that CmmProcPoint line 283 has a hack that works around a GADT-related
- bug in 6.10.
-
- - SDM (2010-02-26) can we remove the Foreign constructor from Convention?
- Reason: we never generate code for a function with the Foreign
- calling convention, and the code for calling foreign calls is generated
-
- AsmCodeGen has a generic Cmm optimiser; move this into new pipeline
EZY (2011-04-16): The mini-inliner has been generalized and ported,
but the constant folding and other optimizations need to still be
@@ -146,100 +55,25 @@ Things to do:
new native codegen, much in the way that we lower calling conventions.
Might need to be a bit sophisticated about aliasing.
- - Question: currently we lift procpoints to become separate
- CmmProcs. Do we still want to do this?
-
- NB: and advantage of continuing to do this is that
- we can do common-proc elimination!
-
- Move to new Cmm rep:
* Make native CG consume New Cmm;
* Convert Old Cmm->New Cmm to keep old path alive
* Produce New Cmm when reading in .cmm files
- - Consider module names
-
- Top-level SRT threading is a bit ugly
- - Add type/newtype for CmmModule = [CmmGroup] -- A module
- CmmGroup = [CmmTop] -- A .o file
- CmmTop = Proc | Data -- A procedure or data
-
- - This is a *change*: currently a CmmGroup is one function's-worth of code
- regardless of SplitObjs. Question: can we *always* generate M.o if there
- is just one element in the list (rather than M/M1.o, M/M2.o etc)
-
- One SRT per group.
-
- See "CAFs" below; we want to totally refactor the way SRTs are calculated
- - Pull out Areas into its own module
- Parameterise AreaMap (note there are type synonyms in CmmStackLayout!)
- Add ByteWidth = Int
- type SubArea = (Area, ByteOff, ByteWidth)
- ByteOff should not be defined in SMRep -- that is too high up the hierarchy
-
- - SMRep should not be imported by any module in cmm/! Make it so.
- -- ByteOff etc ==> CmmExpr
- -- rET_SMALL etc ==> CmmInfo
- Check that there are no other imports from codeGen in cmm/
-
- - If you eliminate a label by branch chain elimination,
- what happens if there's an Area associated with that label?
-
- - Think about a non-flattened representation?
-
- - LastCall:
- * Use record fields for LastCall!
- * cml_ret_off should be a ByteOff
- * Split into
- LastCall (which has a successor) and
- LastJump (which does not, includes return?)
- - does not have cml_cont, cml_ret_args, cml_ret_off
- LastForeignCall
- - safe!
- - expands into save/MidForeignCall/restore/goto
- - like any LastCall, target of the call gets an info table
-
- - JD: remind self of what goes wrong if you turn off the
- liveness of the update frame
-
- Garbage-collect http://hackage.haskell.org/trac/ghc/wiki/Commentary/Compiler/CPS
moving good stuff into
http://hackage.haskell.org/trac/ghc/wiki/Commentary/Compiler/NewCodeGenPipeline
-
- - We believe that all of CmmProcPoint.addProcPointProtocols is dead. What
- goes wrong if we simply never call it?
-
- - Something fishy in CmmStackLayout.hs
- * In particular, 'getAreaSize' returns an AreaMap, but we *know* the width of
- LocalRegs, so it'd be better to return FiniteMap AreaId ByteWidth
- * setSuccSPs looks fishy. Rather than lookin in procPoints, it could
- just lookup the block in areaSize which, after all, has a binding
- for precisely successors of calls. All other blocks (including proc
- points that are not successors of a call, we think) can be treated
- uniformly: zero-size Area, and use inSP.
-
-
- Currently AsmCodeGen top level calls AsmCodeGen.cmmToCmm, which is a small
C-- optimiser. It has quite a lot of boilerplate folding code in AsmCodeGen
(cmmBlockConFold, cmmStmtConFold, cmmExprConFold), before calling out to
CmmOpt. ToDo: see what optimisations are being done; and do them before
AsmCodeGen.
- - Modularise the CPS pipeline; instead of ...; A;B;C; ...
- use ..; ABC; ....
-
- - Most of HscMain.tryNewCodeGen does not belong in HscMain. Instead
- if new_cg then
- StgCmm.codeGen
- processCmm [including generating "raw" cmm]
- else
- CodeGen.codeGen
- cmmToRawCmm
-
-
- If we stick CAF and stack liveness info on a LastCall node (not LastRet/Jump)
then all CAF and stack liveness stuff be completed before we split
into separate C procedures.
@@ -312,9 +146,6 @@ ClosureInfo.lhs
Modules in cmm/
----------------------------------------------------
--------- Testing stuff ------------
-DynFlags: -frun-cpsz
-
-------- Moribund stuff ------------
OldCmm.hs Definition of flowgraph of old representation
Imports some data types from (new) Cmm
@@ -357,93 +188,6 @@ PprC.hs Pretty print Cmm in C syntax
CLabel.hs CLabel
BlockId.hs BlockId, BlockEnv, BlockSet
-----------------------------------------------------
- Top-level structure
-----------------------------------------------------
-
-* New codgen called in HscMain.hscGenHardCode, by calling HscMain.tryNewCodeGen,
- enabled by -fnew-codegen (Opt_TryNewCodeGen)
-
- THEN it calls CmmInfo.cmmToRawCmm to lay out the details of info tables
- type Cmm = GenCmm CmmStatic CmmInfo (ListGraph CmmStmt)
- type RawCmm = GenCmm CmmStatic [CmmStatic] (ListGraph CmmStmt)
-
-* HscMain.tryNewCodeGen
- - STG->Cmm: StgCmm.codeGen (new codegen)
- - Optimize and CPS: CmmPipeline.cmmPipeline
- - Convert: CmmCvt.cmmOfZgraph (convert to old rep) very self contained
-
-* StgCmm.hs The new STG -> Cmm conversion code generator
- Lots of modules StgCmmXXX
-
-
-----------------------------------------------------
- CmmPipeline.cmmPipeline The new pipeline
-----------------------------------------------------
-
-CmmPipeline.cmmPipeline:
- 1. Do control flow optimization
- 2. Do cpsTop for each procedures separately
- 3. Build SRT representation; this spans multiple procedures
- (unless split-objs)
- 4. Do control flow optimization on all resulting procedures
-
-cpsTop:
- * CmmCommonBlockElim.elimCommonBlocks:
- eliminate common blocks
-
- * CmmProcPoint.minimalProcPointSet
- identify proc-points
- no change to graph
-
- * CmmProcPoint.addProcPointProtocols
- something to do with the MA optimisation
- probably entirely unnecessary
-
- * Spill and reload:
- - CmmSpillReload.dualLivenessWithInsertion
- insert spills/reloads across
- LastCalls, and
- Branches to proc-points
- Now sink those reloads (and other instructions):
- - CmmSpillReload.rewriteAssignments
- - CmmSpillReload.removeDeadAssignmentsAndReloads
-
- * CmmStackLayout.stubSlotsOnDeath
- debug only: zero out dead slots when they die
-
- * Stack layout
- - CmmStackLayout.lifeSlotAnal:
- find which sub-areas are live on entry to each block
-
- - CmmStackLayout.layout
- Lay out the stack, returning an AreaMap
- type AreaMap = FiniteMap Area ByteOff
- -- Byte offset of the oldest byte of the Area,
- -- relative to the oldest byte of the Old Area
-
- - CmmStackLayout.manifestSP
- Manifest the stack pointer
-
- * Split into separate procedures
-
- - CmmProcPoint.procPointAnalysis
- Given set of proc points (computed earlier by
- CmmProcPoint.minimalProcPointSet) find which blocks
- are reachable from each
- Each block should be reachable from *one* proc point, so
- the blocks reachable from P are the internal nodes of
- the final procedure P
- NB: if we the earlier analysis had produced too few proc-points
- we should nevertheless be fine by code duplication; but
- that is not implemented
-
- - CmmProcPoint.splitAtProcPoints
- Using this info, split into separate procedures
-
- - CmmBuildInfoTables.setInfoTableStackMap
- Attach stack maps to each info table
-
----------------------------------------------------
Proc-points
@@ -539,116 +283,3 @@ a dominator analysis, using the Dataflow Engine.
* DECIDED: we can generate SRTs based on the final Cmm program
without knowledge of how it is generated.
-----------------------------------------------------
- Foreign calls
-----------------------------------------------------
-
-See Note [Foreign calls] in CmmNode! This explains that a safe
-foreign call must do this:
- save thread state
- push info table (on thread stack) to describe frame
- make call (via C stack)
- pop info table
- restore thread state
-and explains why this expansion must be done late in the day.
-
-Hence,
- - Every foreign call is represented as a middle node
-
- - *Unsafe* foreign calls are simply "fat machine instructions"
- and are passed along to the native code generator
-
- - *Safe* foreign calls are "lowered" to unsafe calls by wrapping
- them in the above save/restore sequence. This step is done
- very late in the pipeline, just before handing to the native
- code gen.
-
- This lowering is done by BuildInfoTables.lowerSafeForeignCalls
-
-
-NEW PLAN for foreign calls:
- - Unsafe foreign calls remain as a middle node (fat machine instruction)
- Even the parameter passing is not lowered (just as machine instrs
- get arguments).
-
- - Initially, safe foreign calls appear as LastCalls with
-
-
-----------------------------------------------------
- Cmm representations
-----------------------------------------------------
-
-* CmmDecl.hs
- The type [GenCmm d h g] represents a whole module,
- ** one list element per .o file **
- Without SplitObjs, the list has exactly one element
-
- newtype GenCmm d h g = Cmm [GenCmmTop d h g] -- A whole .o file
- data GenCmmTop d h g
- = CmmProc h g -- One procedure, graph d
- | CmmData <stuff> [d] -- Initialised data, items d
-
- Old and new piplines use different representations
- (CmmCvt.hs converts between the two)
-
-
--------------
-OLD BACK END representations (OldCmm.hs):
- type Cmm = GenCmm CmmStatic CmmInfo (ListGraph CmmStmt)
- -- A whole module
- newtype ListGraph i = ListGraph [GenBasicBlock i]
-
- data CmmStmt = Assign | Store | Return etc -- OLD BACK END ONLY
-
-
- Once the info tables are laid out, we replace CmmInfo with [CmmStatic]
- type RawCmm = GenCmm CmmStatic [CmmStatic] (ListGraph CmmStmt)
- which represents the info tables as data, that should
- immediately precede the code
-
--------------
-NEW BACK END representations
-* Uses Hoopl library, a zero-boot package
-* CmmNode defines a node of a flow graph.
-* Cmm defines CmmGraph, CmmTop, Cmm
- - CmmGraph is a closed/closed graph + an entry node.
-
- data CmmGraph = CmmGraph { g_entry :: BlockId
- , g_graph :: Graph CmmNode C C }
-
- - CmmTop is a top level chunk, specialization of GenCmmTop from CmmDecl.hs
- with CmmGraph as a flow graph.
- - Cmm is a collection of CmmTops.
-
- type Cmm = GenCmm CmmStatic CmmTopInfo CmmGraph
- type CmmTop = GenCmmTop CmmStatic CmmTopInfo CmmGraph
-
- - CmmTop uses CmmTopInfo, which is a CmmInfoTable and CmmStackInfo
-
- data CmmTopInfo = TopInfo {info_tbl :: CmmInfoTable, stack_info :: CmmStackInfo}
-
- - CmmStackInfo
-
- data CmmStackInfo = StackInfo {arg_space :: ByteOff, updfr_space :: Maybe ByteOff}
-
- * arg_space = SP offset on entry
- * updfr_space space = SP offset on exit
- Once the staci is manifested, we could drom CmmStackInfo, ie. get
- GenCmm CmmStatic CmmInfoTable CmmGraph, but we do not do that currently.
-
-
-* MkGraph.hs: smart constructors for Cmm.hs
- Beware, the CmmAGraph defined here does not use AGraph from Hoopl,
- as CmmAGraph can be opened or closed at exit, See the notes in that module.
-
--------------
-* SHARED stuff
- CmmDecl.hs - GenCmm and GenCmmTop types
- CmmExpr.hs - defines the Cmm expression types
- - CmmExpr, CmmReg, CmmLit, LocalReg, GlobalReg
- - Area, AreaId etc (separate module?)
- CmmType.hs - CmmType, Width etc (saparate module?)
- CmmMachOp.hs - MachOp and CallishMachOp types
-
- BlockId.hs defines BlockId, BlockEnv, BlockSet
--------------
diff --git a/compiler/codeGen/CgInfoTbls.hs b/compiler/codeGen/CgInfoTbls.hs
index 6c77255a62..7cdb1b6f7e 100644
--- a/compiler/codeGen/CgInfoTbls.hs
+++ b/compiler/codeGen/CgInfoTbls.hs
@@ -36,7 +36,7 @@ import CgBindery
import CgCallConv
import CgUtils
import CgMonad
-import CmmBuildInfoTables
+import CmmUtils
import OldCmm
import CLabel
@@ -66,10 +66,9 @@ emitClosureCodeAndInfoTable cl_info args body
-- Convert from 'ClosureInfo' to 'CmmInfo'.
-- Not used for return points. (The 'smRepClosureTypeInt' call would panic.)
-mkCmmInfo :: ClosureInfo -> FCode CmmInfo
+mkCmmInfo :: ClosureInfo -> FCode CmmInfoTable
mkCmmInfo cl_info
- = return (CmmInfo gc_target Nothing $
- CmmInfoTable { cit_lbl = infoTableLabelFromCI cl_info,
+ = return (CmmInfoTable { cit_lbl = infoTableLabelFromCI cl_info,
cit_rep = closureSMRep cl_info,
cit_prof = prof,
cit_srt = closureSRT cl_info })
@@ -79,14 +78,6 @@ mkCmmInfo cl_info
ty_descr_w8 = stringToWord8s (closureTypeDescr cl_info)
val_descr_w8 = stringToWord8s (closureValDescr cl_info)
- -- The gc_target is to inform the CPS pass when it inserts a stack check.
- -- Since that pass isn't used yet we'll punt for now.
- -- When the CPS pass is fully integrated, this should
- -- be replaced by the label that any heap check jumped to,
- -- so that branch can be shared by both the heap (from codeGen)
- -- and stack checks (from the CPS pass).
- gc_target = panic "TODO: gc_target"
-
-------------------------------------------------------------------------
--
-- Generating the info table and code for a return point
@@ -105,8 +96,7 @@ emitReturnTarget name stmts
; blks <- cgStmtsToBlocks stmts
; frame <- mkStackLayout
; let smrep = mkStackRep (mkLiveness frame)
- info = CmmInfo gc_target Nothing info_tbl
- info_tbl = CmmInfoTable { cit_lbl = info_lbl
+ info = CmmInfoTable { cit_lbl = info_lbl
, cit_prof = NoProfilingInfo
, cit_rep = smrep
, cit_srt = srt_info }
@@ -118,14 +108,6 @@ emitReturnTarget name stmts
info_lbl = mkReturnInfoLabel uniq
entry_lbl = mkReturnPtLabel uniq
- -- The gc_target is to inform the CPS pass when it inserts a stack check.
- -- Since that pass isn't used yet we'll punt for now.
- -- When the CPS pass is fully integrated, this should
- -- be replaced by the label that any heap check jumped to,
- -- so that branch can be shared by both the heap (from codeGen)
- -- and stack checks (from the CPS pass).
- gc_target = panic "TODO: gc_target"
-
-- Build stack layout information from the state of the 'FCode' monad.
-- Should go away once 'codeGen' starts using the CPS conversion
-- pass to handle the stack. Until then, this is really just
@@ -375,8 +357,8 @@ funInfoTable info_ptr
emitInfoTableAndCode
:: CLabel -- Label of entry or ret
- -> CmmInfo -- ...the info table
- -> [CmmFormal] -- ...args
+ -> CmmInfoTable -- ...the info table
+ -> [CmmFormal] -- ...args
-> [CmmBasicBlock] -- ...and body
-> Code
diff --git a/compiler/codeGen/CgMonad.lhs b/compiler/codeGen/CgMonad.lhs
index b96898f591..71da9e9ae0 100644
--- a/compiler/codeGen/CgMonad.lhs
+++ b/compiler/codeGen/CgMonad.lhs
@@ -13,8 +13,8 @@ stuff fits into the Big Picture.
module CgMonad (
Code, FCode,
- initC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs,
- returnFC, fixC, fixC_, checkedAbsC,
+ initC, runC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs,
+ returnFC, fixC, fixC_, checkedAbsC,
stmtC, stmtsC, labelC, emitStmts, nopC, whenC, newLabelC,
newUnique, newUniqSupply,
@@ -386,11 +386,12 @@ instance Monad FCode where
{-# INLINE thenFC #-}
{-# INLINE returnFC #-}
-initC :: DynFlags -> Module -> FCode a -> IO a
-initC dflags mod (FCode code) = do
- uniqs <- mkSplitUniqSupply 'c'
- case code (initCgInfoDown dflags mod) (initCgState uniqs) of
- (res, _) -> return res
+initC :: IO CgState
+initC = do { uniqs <- mkSplitUniqSupply 'c'
+ ; return (initCgState uniqs) }
+
+runC :: DynFlags -> Module -> CgState -> FCode a -> (a,CgState)
+runC dflags mod st (FCode code) = code (initCgInfoDown dflags mod) st
returnFC :: a -> FCode a
returnFC val = FCode $ \_ state -> (val, state)
@@ -708,7 +709,7 @@ emitDecl decl = do
state <- getState
setState $ state { cgs_tops = cgs_tops state `snocOL` decl }
-emitProc :: CmmInfo -> CLabel -> [CmmFormal] -> [CmmBasicBlock] -> Code
+emitProc :: CmmInfoTable -> CLabel -> [CmmFormal] -> [CmmBasicBlock] -> Code
emitProc info lbl [] blocks = do
let proc_block = CmmProc info lbl (ListGraph blocks)
state <- getState
@@ -720,7 +721,7 @@ emitSimpleProc :: CLabel -> Code -> Code
emitSimpleProc lbl code = do
stmts <- getCgStmts code
blks <- cgStmtsToBlocks stmts
- emitProc (CmmInfo Nothing Nothing CmmNonInfoTable) lbl [] blks
+ emitProc CmmNonInfoTable lbl [] blks
-- Get all the CmmTops (there should be no stmts)
-- Return a single Cmm which may be split from other Cmms by
diff --git a/compiler/codeGen/CgStackery.lhs b/compiler/codeGen/CgStackery.lhs
index 2628760183..a869795caa 100644
--- a/compiler/codeGen/CgStackery.lhs
+++ b/compiler/codeGen/CgStackery.lhs
@@ -43,6 +43,7 @@ import OrdList
import Outputable
import Control.Monad
+import Data.List
\end{code}
%************************************************************************
@@ -333,7 +334,7 @@ Explicitly free some stack space.
freeStackSlots :: [VirtualSpOffset] -> Code
freeStackSlots extra_free
= do { stk_usg <- getStkUsage
- ; let all_free = addFreeSlots (freeStk stk_usg) (sortLe (<=) extra_free)
+ ; let all_free = addFreeSlots (freeStk stk_usg) (sort extra_free)
; let (new_vsp, new_free) = trim (virtSp stk_usg) all_free
; setStkUsage (stk_usg { virtSp = new_vsp, freeStk = new_free }) }
diff --git a/compiler/codeGen/CgUtils.hs b/compiler/codeGen/CgUtils.hs
index f971a0500a..e7d17c1f03 100644
--- a/compiler/codeGen/CgUtils.hs
+++ b/compiler/codeGen/CgUtils.hs
@@ -72,7 +72,9 @@ import Outputable
import Data.Char
import Data.Word
+import Data.List
import Data.Maybe
+import Data.Ord
-------------------------------------------------------------------------
--
@@ -527,12 +529,10 @@ emitSwitch tag_expr branches mb_deflt lo_tag hi_tag
; let via_C | HscC <- hscTarget dflags = True
| otherwise = False
- ; stmts <- mk_switch tag_expr (sortLe le branches)
+ ; stmts <- mk_switch tag_expr (sortBy (comparing fst) branches)
mb_deflt_id lo_tag hi_tag via_C
; emitCgStmts stmts
}
- where
- (t1,_) `le` (t2,_) = t1 <= t2
mk_switch :: CmmExpr -> [(ConTagZ, CgStmts)]
@@ -699,10 +699,8 @@ emitLitSwitch _ [] deflt = emitCgStmts deflt
emitLitSwitch scrut branches deflt_blk
= do { scrut' <- assignTemp scrut
; deflt_blk_id <- forkCgStmts deflt_blk
- ; blk <- mk_lit_switch scrut' deflt_blk_id (sortLe le branches)
+ ; blk <- mk_lit_switch scrut' deflt_blk_id (sortBy (comparing fst) branches)
; emitCgStmts blk }
- where
- le (t1,_) (t2,_) = t1 <= t2
mk_lit_switch :: CmmExpr -> BlockId
-> [(Literal,CgStmts)]
diff --git a/compiler/codeGen/CodeGen.lhs b/compiler/codeGen/CodeGen.lhs
index ce12d43bbf..c9b2bf8ab0 100644
--- a/compiler/codeGen/CodeGen.lhs
+++ b/compiler/codeGen/CodeGen.lhs
@@ -30,7 +30,7 @@ import CgHpc
import CLabel
import OldCmm
-import OldPprCmm
+import OldPprCmm ()
import StgSyn
import PrelNames
@@ -45,40 +45,52 @@ import TyCon
import Module
import ErrUtils
import Panic
+import Outputable
import Util
+import OrdList
+import Stream (Stream, liftIO)
+import qualified Stream
+
+import Data.IORef
+
codeGen :: DynFlags
-> Module -- Module we are compiling
-> [TyCon] -- Type constructors
-> CollectedCCs -- (Local/global) cost-centres needing declaring/registering.
-> [(StgBinding,[(Id,[Id])])] -- Bindings to convert, with SRTs
-> HpcInfo -- Profiling info
- -> IO [CmmGroup]
+ -> Stream IO CmmGroup ()
-- N.B. returning '[Cmm]' and not 'Cmm' here makes it
-- possible for object splitting to split up the
-- pieces later.
-codeGen dflags this_mod data_tycons cost_centre_info stg_binds hpc_info = do
- showPass dflags "CodeGen"
- code_stuff <-
- initC dflags this_mod $ do
- cmm_binds <- mapM (getCmm . cgTopBinding dflags) stg_binds
- cmm_tycons <- mapM cgTyCon data_tycons
- cmm_init <- getCmm (mkModuleInit dflags cost_centre_info this_mod hpc_info)
- return (cmm_init : cmm_binds ++ cmm_tycons)
- -- Put datatype_stuff after code_stuff, because the
- -- datatype closure table (for enumeration types) to
- -- (say) PrelBase_True_closure, which is defined in
- -- code_stuff
-
- -- Note [codegen-split-init] the cmm_init block must
- -- come FIRST. This is because when -split-objs is on
- -- we need to combine this block with its
- -- initialisation routines; see Note
- -- [pipeline-split-init].
-
- dumpIfSet_dyn dflags Opt_D_dump_cmm "Cmm" (pprCmms code_stuff)
- return code_stuff
+codeGen dflags this_mod data_tycons cost_centre_info stg_binds hpc_info
+
+ = do { liftIO $ showPass dflags "CodeGen"
+
+ ; cgref <- liftIO $ newIORef =<< initC
+ ; let cg :: FCode CmmGroup -> Stream IO CmmGroup ()
+ cg fcode = do
+ cmm <- liftIO $ do
+ st <- readIORef cgref
+ let (a,st') = runC dflags this_mod st fcode
+
+ dumpIfSet_dyn dflags Opt_D_dump_cmm "Cmm" $ ppr a
+
+ -- NB. stub-out cgs_tops and cgs_stmts. This fixes
+ -- a big space leak. DO NOT REMOVE!
+ writeIORef cgref $! st'{ cgs_tops = nilOL,
+ cgs_stmts = nilOL }
+ return a
+ Stream.yield cmm
+
+ ; cg (getCmm $ mkModuleInit dflags cost_centre_info this_mod hpc_info)
+
+ ; mapM_ (cg . getCmm . cgTopBinding dflags) stg_binds
+
+ ; mapM_ (cg . cgTyCon) data_tycons
+ }
mkModuleInit
:: DynFlags
diff --git a/compiler/codeGen/StgCmm.hs b/compiler/codeGen/StgCmm.hs
index 17a7062559..696af8107e 100644
--- a/compiler/codeGen/StgCmm.hs
+++ b/compiler/codeGen/StgCmm.hs
@@ -46,6 +46,13 @@ import TyCon
import Module
import ErrUtils
import Outputable
+import Stream
+
+import OrdList
+import MkGraph
+
+import Data.IORef
+import Control.Monad (when)
import Util
codeGen :: DynFlags
@@ -54,39 +61,51 @@ codeGen :: DynFlags
-> CollectedCCs -- (Local/global) cost-centres needing declaring/registering.
-> [(StgBinding,[(Id,[Id])])] -- Bindings to convert, with SRTs
-> HpcInfo
- -> IO [CmmGroup] -- Output
+ -> Stream IO CmmGroup () -- Output as a stream, so codegen can
+ -- be interleaved with output
codeGen dflags this_mod data_tycons
cost_centre_info stg_binds hpc_info
- = do { showPass dflags "New CodeGen"
-
--- Why?
--- ; mapM_ (\x -> seq x (return ())) data_tycons
-
- ; code_stuff <- initC dflags this_mod $ do
- { cmm_binds <- mapM (getCmm . cgTopBinding dflags) stg_binds
- ; cmm_tycons <- mapM cgTyCon data_tycons
- ; cmm_init <- getCmm (mkModuleInit cost_centre_info
- this_mod hpc_info)
- ; return (cmm_init : cmm_binds ++ cmm_tycons)
- }
+ = do { liftIO $ showPass dflags "New CodeGen"
+
+ -- cg: run the code generator, and yield the resulting CmmGroup
+ -- Using an IORef to store the state is a bit crude, but otherwise
+ -- we would need to add a state monad layer.
+ ; cgref <- liftIO $ newIORef =<< initC
+ ; let cg :: FCode () -> Stream IO CmmGroup ()
+ cg fcode = do
+ cmm <- liftIO $ do
+ st <- readIORef cgref
+ let (a,st') = runC dflags this_mod st (getCmm fcode)
+
+ -- NB. stub-out cgs_tops and cgs_stmts. This fixes
+ -- a big space leak. DO NOT REMOVE!
+ writeIORef cgref $! st'{ cgs_tops = nilOL,
+ cgs_stmts = mkNop }
+ return a
+ yield cmm
+
+ -- Note [codegen-split-init] the cmm_init block must come
+ -- FIRST. This is because when -split-objs is on we need to
+ -- combine this block with its initialisation routines; see
+ -- Note [pipeline-split-init].
+ ; cg (mkModuleInit cost_centre_info this_mod hpc_info)
+
+ ; mapM_ (cg . cgTopBinding dflags) stg_binds
+
-- Put datatype_stuff after code_stuff, because the
-- datatype closure table (for enumeration types) to
-- (say) PrelBase_True_closure, which is defined in
-- code_stuff
-
- -- N.B. returning '[Cmm]' and not 'Cmm' here makes it
- -- possible for object splitting to split up the
- -- pieces later.
-
- -- Note [codegen-split-init] the cmm_init block must
- -- come FIRST. This is because when -split-objs is on
- -- we need to combine this block with its
- -- initialisation routines; see Note
- -- [pipeline-split-init].
-
- ; return code_stuff }
-
+ ; let do_tycon tycon = do
+ -- Generate a table of static closures for an
+ -- enumeration type Note that the closure pointers are
+ -- tagged.
+ when (isEnumerationTyCon tycon) $ cg (cgEnumerationTyCon tycon)
+ mapM_ (cg . cgDataCon) (tyConDataCons tycon)
+
+ ; mapM_ do_tycon data_tycons
+ }
---------------------------------------------------------------
-- Top-level bindings
@@ -108,7 +127,7 @@ cgTopBinding dflags (StgNonRec id rhs, _srts)
; info <- cgTopRhs id' rhs
; addBindC (cg_id info) info -- Add the *un-externalised* Id to the envt,
-- so we find it when we look up occurrences
- }
+ }
cgTopBinding dflags (StgRec pairs, _srts)
= do { let (bndrs, rhss) = unzip pairs
@@ -117,7 +136,7 @@ cgTopBinding dflags (StgRec pairs, _srts)
; fixC_(\ new_binds -> do
{ addBindsC new_binds
; mapFCs ( \ (b,e) -> cgTopRhs b e ) pairs' })
- ; return () }
+ ; return () }
-- Urgh! I tried moving the forkStatics call from the rhss of cgTopRhs
-- to enclose the listFCs in cgTopBinding, but that tickled the
@@ -187,65 +206,19 @@ mkModuleInit cost_centre_info this_mod hpc_info
; emitDecl (CmmData Data (Statics (mkPlainModuleInitLabel this_mod) []))
}
+
---------------------------------------------------------------
-- Generating static stuff for algebraic data types
---------------------------------------------------------------
-{- [These comments are rather out of date]
-
-Macro Kind of constructor
-CONST_INFO_TABLE@ Zero arity (no info -- compiler uses static closure)
-CHARLIKE_INFO_TABLE Charlike (no info -- compiler indexes fixed array)
-INTLIKE_INFO_TABLE Intlike; the one macro generates both info tbls
-SPEC_INFO_TABLE SPECish, and bigger than or equal to MIN_UPD_SIZE
-GEN_INFO_TABLE GENish (hence bigger than or equal to MIN_UPD_SIZE@)
-Possible info tables for constructor con:
-
-* _con_info:
- Used for dynamically let(rec)-bound occurrences of
- the constructor, and for updates. For constructors
- which are int-like, char-like or nullary, when GC occurs,
- the closure tries to get rid of itself.
-
-* _static_info:
- Static occurrences of the constructor macro: STATIC_INFO_TABLE.
-
-For zero-arity constructors, \tr{con}, we NO LONGER generate a static closure;
-it's place is taken by the top level defn of the constructor.
-
-For charlike and intlike closures there is a fixed array of static
-closures predeclared.
--}
-
-cgTyCon :: TyCon -> FCode CmmGroup -- All constructors merged together
-cgTyCon tycon
- = do { constrs <- mapM (getCmm . cgDataCon) (tyConDataCons tycon)
-
- -- Generate a table of static closures for an enumeration type
- -- Put the table after the data constructor decls, because the
- -- datatype closure table (for enumeration types)
- -- to (say) PrelBase_$wTrue_closure, which is defined in code_stuff
- -- Note that the closure pointers are tagged.
-
- -- N.B. comment says to put table after constructor decls, but
- -- code puts it before --- NR 16 Aug 2007
- ; extra <- cgEnumerationTyCon tycon
-
- ; return (concat (extra ++ constrs))
- }
-
-cgEnumerationTyCon :: TyCon -> FCode [CmmGroup]
+cgEnumerationTyCon :: TyCon -> FCode ()
cgEnumerationTyCon tycon
- | isEnumerationTyCon tycon
- = do { tbl <- getCmm $
- emitRODataLits (mkLocalClosureTableLabel (tyConName tycon) NoCafRefs)
- [ CmmLabelOff (mkLocalClosureLabel (dataConName con) NoCafRefs)
- (tagForCon con)
- | con <- tyConDataCons tycon]
- ; return [tbl] }
- | otherwise
- = return []
+ = emitRODataLits (mkLocalClosureTableLabel (tyConName tycon) NoCafRefs)
+ [ CmmLabelOff (mkLocalClosureLabel (dataConName con) NoCafRefs)
+ (tagForCon con)
+ | con <- tyConDataCons tycon]
+
cgDataCon :: DataCon -> FCode ()
-- Generate the entry code, info tables, and (for niladic constructor)
diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs
index 3b166e3b6a..f98283f737 100644
--- a/compiler/codeGen/StgCmmBind.hs
+++ b/compiler/codeGen/StgCmmBind.hs
@@ -76,17 +76,17 @@ cgTopRhsClosure :: Id
cgTopRhsClosure id ccs _ upd_flag srt args body = do
{ -- LAY OUT THE OBJECT
let name = idName id
- ; lf_info <- mkClosureLFInfo id TopLevel [] upd_flag args
- ; srt_info <- getSRTInfo srt
+ ; lf_info <- mkClosureLFInfo id TopLevel [] upd_flag args
+ ; has_srt <- getSRTInfo srt
; mod_name <- getModuleName
; dflags <- getDynFlags
; let descr = closureDescription dflags mod_name name
- closure_info = mkClosureInfo True id lf_info 0 0 srt_info descr
- closure_label = mkLocalClosureLabel name (idCafInfo id)
+ closure_info = mkClosureInfo True id lf_info 0 0 descr
+ closure_label = mkLocalClosureLabel name (idCafInfo id)
cg_id_info = litIdInfo id lf_info (CmmLabel closure_label)
caffy = idCafInfo id
info_tbl = mkCmmInfo closure_info -- XXX short-cut
- closure_rep = mkStaticClosureFields info_tbl ccs caffy []
+ closure_rep = mkStaticClosureFields info_tbl ccs caffy has_srt []
-- BUILD THE OBJECT, AND GENERATE INFO TABLE (IF NECESSARY)
; emitDataLits closure_label closure_rep
@@ -110,7 +110,7 @@ cgBind (StgNonRec name rhs)
; emit (init <*> body) }
cgBind (StgRec pairs)
- = do { ((new_binds, inits), body) <- getCodeR $ fixC (\ new_binds_inits ->
+ = do { ((new_binds, inits), body) <- getCodeR $ fixC (\ new_binds_inits ->
do { addBindsC $ fst new_binds_inits -- avoid premature deconstruction
; liftM unzip $ listFCs [ cgRhs b e | (b,e) <- pairs ] })
; addBindsC new_binds
@@ -162,8 +162,8 @@ cgRhs :: Id -> StgRhs -> FCode (CgIdInfo, CmmAGraph)
cgRhs name (StgRhsCon cc con args)
= buildDynCon name cc con args
-cgRhs name (StgRhsClosure cc bi fvs upd_flag srt args body)
- = mkRhsClosure name cc bi (nonVoidIds fvs) upd_flag srt args body
+cgRhs name (StgRhsClosure cc bi fvs upd_flag _srt args body)
+ = mkRhsClosure name cc bi (nonVoidIds fvs) upd_flag args body
------------------------------------------------------------------------
-- Non-constructor right hand sides
@@ -171,7 +171,7 @@ cgRhs name (StgRhsClosure cc bi fvs upd_flag srt args body)
mkRhsClosure :: Id -> CostCentreStack -> StgBinderInfo
-> [NonVoid Id] -- Free vars
- -> UpdateFlag -> SRT
+ -> UpdateFlag
-> [Id] -- Args
-> StgExpr
-> FCode (CgIdInfo, CmmAGraph)
@@ -215,8 +215,7 @@ for semi-obvious reasons.
mkRhsClosure bndr cc bi
[NonVoid the_fv] -- Just one free var
upd_flag -- Updatable thunk
- _srt
- [] -- A thunk
+ [] -- A thunk
body@(StgCase (StgApp scrutinee [{-no args-}])
_ _ _ _ -- ignore uniq, etc.
(AlgAlt _)
@@ -247,8 +246,7 @@ mkRhsClosure bndr cc bi
mkRhsClosure bndr cc bi
fvs
upd_flag
- _srt
- [] -- No args; a thunk
+ [] -- No args; a thunk
body@(StgApp fun_id args)
| args `lengthIs` (arity-1)
@@ -269,7 +267,7 @@ mkRhsClosure bndr cc bi
arity = length fvs
---------- Default case ------------------
-mkRhsClosure bndr cc _ fvs upd_flag srt args body
+mkRhsClosure bndr cc _ fvs upd_flag args body
= do { -- LAY OUT THE OBJECT
-- If the binder is itself a free variable, then don't store
-- it in the closure. Instead, just bind it to Node on entry.
@@ -288,17 +286,16 @@ mkRhsClosure bndr cc _ fvs upd_flag srt args body
-- MAKE CLOSURE INFO FOR THIS CLOSURE
; lf_info <- mkClosureLFInfo bndr NotTopLevel fvs upd_flag args
; mod_name <- getModuleName
- ; c_srt <- getSRTInfo srt
- ; dflags <- getDynFlags
- ; let name = idName bndr
- descr = closureDescription dflags mod_name name
- fv_details :: [(NonVoid Id, VirtualHpOffset)]
+ ; dflags <- getDynFlags
+ ; let name = idName bndr
+ descr = closureDescription dflags mod_name name
+ fv_details :: [(NonVoid Id, VirtualHpOffset)]
(tot_wds, ptr_wds, fv_details)
= mkVirtHeapOffsets (isLFThunk lf_info)
(addIdReps (map stripNV reduced_fvs))
closure_info = mkClosureInfo False -- Not static
bndr lf_info tot_wds ptr_wds
- c_srt descr
+ descr
-- BUILD ITS INFO TABLE AND CODE
; forkClosureBody $
@@ -345,8 +342,7 @@ cgStdThunk bndr _cc _bndr_info _body lf_info payload
descr = closureDescription dflags mod_name (idName bndr)
closure_info = mkClosureInfo False -- Not static
bndr lf_info tot_wds ptr_wds
- NoC_SRT -- No SRT for a std-form closure
- descr
+ descr
-- ; (use_cc, blame_cc) <- chooseDynCostCentres cc [{- no args-}] body
; let use_cc = curCCS; blame_cc = curCCS
@@ -546,10 +542,10 @@ emitBlackHoleCode is_single_entry = do
whenC eager_blackholing $ do
tickyBlackHole (not is_single_entry)
- emit (mkStore (cmmOffsetW (CmmReg nodeReg) fixedHdrSize)
- (CmmReg (CmmGlobal CurrentTSO)))
+ emitStore (cmmOffsetW (CmmReg nodeReg) fixedHdrSize)
+ (CmmReg (CmmGlobal CurrentTSO))
emitPrimCall [] MO_WriteBarrier []
- emit (mkStore (CmmReg nodeReg) (CmmReg (CmmGlobal EagerBlackholeInfo)))
+ emitStore (CmmReg nodeReg) (CmmReg (CmmGlobal EagerBlackholeInfo))
setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode ()
-- Nota Bene: this function does not change Node (even if it's a CAF),
@@ -598,7 +594,7 @@ pushUpdateFrame es body
offset <- foldM push updfr es
withUpdFrameOff offset body
where push off e =
- do emit (mkStore (CmmStackSlot (CallArea Old) base) e)
+ do emitStore (CmmStackSlot Old base) e
return base
where base = off + widthInBytes (cmmExprWidth e)
@@ -666,13 +662,14 @@ link_caf _is_upd = do
-- node is live, so save it.
-- see Note [atomic CAF entry] in rts/sm/Storage.c
- ; emit $ mkCmmIfThen
- (CmmMachOp mo_wordEq [ CmmReg (CmmLocal ret), CmmLit zeroCLit]) $
+ ; updfr <- getUpdFrameOff
+ ; emit =<< mkCmmIfThen
+ (CmmMachOp mo_wordEq [ CmmReg (CmmLocal ret), CmmLit zeroCLit])
-- re-enter R1. Doing this directly is slightly dodgy; we're
-- assuming lots of things, like the stack pointer hasn't
-- moved since we entered the CAF.
- let target = entryCode (closureInfoPtr (CmmReg nodeReg)) in
- mkJump target [] 0
+ (let target = entryCode (closureInfoPtr (CmmReg nodeReg)) in
+ mkJump target [] updfr)
; return hp_rel }
diff --git a/compiler/codeGen/StgCmmClosure.hs b/compiler/codeGen/StgCmmClosure.hs
index 483a67c1fa..8023abddec 100644
--- a/compiler/codeGen/StgCmmClosure.hs
+++ b/compiler/codeGen/StgCmmClosure.hs
@@ -657,7 +657,6 @@ data ClosureInfo
-- the rest is just an unpacked CmmInfoTable.
closureInfoLabel :: !CLabel,
closureSMRep :: !SMRep, -- representation used by storage mgr
- closureSRT :: !C_SRT, -- What SRT applies to this closure
closureProf :: !ProfilingInfo
}
@@ -667,7 +666,7 @@ mkCmmInfo ClosureInfo {..}
= CmmInfoTable { cit_lbl = closureInfoLabel
, cit_rep = closureSMRep
, cit_prof = closureProf
- , cit_srt = closureSRT }
+ , cit_srt = NoC_SRT }
--------------------------------------
@@ -678,16 +677,14 @@ mkClosureInfo :: Bool -- Is static
-> Id
-> LambdaFormInfo
-> Int -> Int -- Total and pointer words
- -> C_SRT
- -> String -- String descriptor
+ -> String -- String descriptor
-> ClosureInfo
-mkClosureInfo is_static id lf_info tot_wds ptr_wds srt_info val_descr
+mkClosureInfo is_static id lf_info tot_wds ptr_wds val_descr
= ClosureInfo { closureName = name,
closureLFInfo = lf_info,
- closureInfoLabel = info_lbl,
- closureSMRep = sm_rep, -- These four fields are a
- closureSRT = srt_info, -- CmmInfoTable
- closureProf = prof } -- ---
+ closureInfoLabel = info_lbl, -- These three fields are
+ closureSMRep = sm_rep, -- (almost) an info table
+ closureProf = prof } -- (we don't have an SRT yet)
where
name = idName id
sm_rep = mkHeapRep is_static ptr_wds nonptr_wds (lfClosureType lf_info)
@@ -920,15 +917,21 @@ cafBlackHoleInfoTable
, cit_prof = NoProfilingInfo
, cit_srt = NoC_SRT }
-staticClosureNeedsLink :: CmmInfoTable -> Bool
+staticClosureNeedsLink :: Bool -> CmmInfoTable -> Bool
-- A static closure needs a link field to aid the GC when traversing
-- the static closure graph. But it only needs such a field if either
-- a) it has an SRT
-- b) it's a constructor with one or more pointer fields
-- In case (b), the constructor's fields themselves play the role
-- of the SRT.
-staticClosureNeedsLink info_tbl@CmmInfoTable{ cit_rep = smrep }
+--
+-- At this point, the cit_srt field has not been calculated (that
+-- happens right at the end of the Cmm pipeline), but we do have the
+-- VarSet of CAFs that CoreToStg attached, and if that is empty there
+-- will definitely not be an SRT.
+--
+staticClosureNeedsLink has_srt CmmInfoTable{ cit_rep = smrep }
| isConRep smrep = not (isStaticNoCafCon smrep)
- | otherwise = needsSRT (cit_srt info_tbl)
-staticClosureNeedsLink _ = False
+ | otherwise = has_srt -- needsSRT (cit_srt info_tbl)
+staticClosureNeedsLink _ _ = False
diff --git a/compiler/codeGen/StgCmmCon.hs b/compiler/codeGen/StgCmmCon.hs
index a7af5662e9..c348570a54 100644
--- a/compiler/codeGen/StgCmmCon.hs
+++ b/compiler/codeGen/StgCmmCon.hs
@@ -92,6 +92,7 @@ cgTopRhsCon id con args
info_tbl
dontCareCCS -- Because it's static data
caffy -- Has CAF refs
+ False -- no SRT
payload
-- BUILD THE OBJECT
diff --git a/compiler/codeGen/StgCmmEnv.hs b/compiler/codeGen/StgCmmEnv.hs
index f128e3ad60..2edd09da12 100644
--- a/compiler/codeGen/StgCmmEnv.hs
+++ b/compiler/codeGen/StgCmmEnv.hs
@@ -27,7 +27,7 @@ module StgCmmEnv (
bindArgsToRegs, bindToReg, rebindToReg,
bindArgToReg, idToReg,
- getArgAmode, getNonVoidArgAmodes,
+ getArgAmode, getNonVoidArgAmodes,
getCgIdInfo,
maybeLetNoEscape,
) where
@@ -212,7 +212,6 @@ getNonVoidArgAmodes (arg:args)
; amodes <- getNonVoidArgAmodes args
; return ( amode : amodes ) }
-
------------------------------------------------------------------------
-- Interface functions for binding and re-binding names
------------------------------------------------------------------------
diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs
index 9faad02f46..dd1abc23be 100644
--- a/compiler/codeGen/StgCmmExpr.hs
+++ b/compiler/codeGen/StgCmmExpr.hs
@@ -77,7 +77,7 @@ cgExpr (StgLetNoEscape _ _ binds expr) =
; let join_id = mkBlockId (uniqFromSupply us)
; cgLneBinds join_id binds
; cgExpr expr
- ; emit $ mkLabel join_id}
+ ; emitLabel join_id}
cgExpr (StgCase expr _live_vars _save_vars bndr srt alt_type alts) =
cgCase expr bndr srt alt_type alts
@@ -130,7 +130,7 @@ cgLetNoEscapeRhs
cgLetNoEscapeRhs join_id local_cc bndr rhs =
do { (info, rhs_body) <- getCodeR $ cgLetNoEscapeRhsBody local_cc bndr rhs
; let (bid, _) = expectJust "cgLetNoEscapeRhs" $ maybeLetNoEscape info
- ; emit (outOfLine $ mkLabel bid <*> rhs_body <*> mkBranch join_id)
+ ; emitOutOfLine bid $ rhs_body <*> mkBranch join_id
; return info
}
@@ -278,21 +278,69 @@ Hence: two basic plans for
data GcPlan
= GcInAlts -- Put a GC check at the start the case alternatives,
[LocalReg] -- which binds these registers
- SRT -- using this SRT
- | NoGcInAlts -- The scrutinee is a primitive value, or a call to a
+ | NoGcInAlts -- The scrutinee is a primitive value, or a call to a
-- primitive op which does no GC. Absorb the allocation
-- of the case alternative(s) into the upstream check
-------------------------------------
--- See Note [case on Bool]
cgCase :: StgExpr -> Id -> SRT -> AltType -> [StgAlt] -> FCode ()
+
+cgCase (StgOpApp (StgPrimOp op) args _) bndr _srt (AlgAlt tycon) alts
+ | isEnumerationTyCon tycon -- Note [case on bool]
+ = do { tag_expr <- do_enum_primop op args
+
+ -- If the binder is not dead, convert the tag to a constructor
+ -- and assign it.
+ ; when (not (isDeadBinder bndr)) $ do
+ { tmp_reg <- bindArgToReg (NonVoid bndr)
+ ; emitAssign (CmmLocal tmp_reg)
+ (tagToClosure tycon tag_expr) }
+
+ ; (mb_deflt, branches) <- cgAlgAltRhss NoGcInAlts Nothing
+ (NonVoid bndr) alts
+ ; emitSwitch tag_expr branches mb_deflt 0 (tyConFamilySize tycon - 1)
+ }
+ where
+ do_enum_primop :: PrimOp -> [StgArg] -> FCode CmmExpr
+ do_enum_primop TagToEnumOp [arg] -- No code!
+ = getArgAmode (NonVoid arg)
+ do_enum_primop primop args
+ = do tmp <- newTemp bWord
+ cgPrimOp [tmp] primop args
+ return (CmmReg (CmmLocal tmp))
+
{-
-cgCase (OpApp ) bndr srt AlgAlt [(DataAlt flase, a2]
- | isBoolTy (idType bndr)
- , isDeadBndr bndr
- =
+Note [case on bool]
+
+This special case handles code like
+
+ case a <# b of
+ True ->
+ False ->
+
+If we let the ordinary case code handle it, we'll get something like
+
+ tmp1 = a < b
+ tmp2 = Bool_closure_tbl[tmp1]
+ if (tmp2 & 7 != 0) then ... // normal tagged case
+
+but this junk won't optimise away. What we really want is just an
+inline comparison:
+
+ if (a < b) then ...
+
+So we add a special case to generate
+
+ tmp1 = a < b
+ if (tmp1 == 0) then ...
+
+and later optimisations will further improve this.
+
+We should really change all these primops to return Int# instead, that
+would make this special case go away.
-}
+
-- Note [ticket #3132]: we might be looking at a case of a lifted Id
-- that was cast to an unlifted type. The Id will always be bottom,
-- but we don't want the code generator to fall over here. If we
@@ -319,7 +367,7 @@ cgCase (StgApp v []) bndr _ alt_type@(PrimAlt _) alts
do { when (not reps_compatible) $
panic "cgCase: reps do not match, perhaps a dodgy unsafeCoerce?"
; v_info <- getCgIdInfo v
- ; emit (mkAssign (CmmLocal (idToReg (NonVoid bndr))) (idInfoToAmode v_info))
+ ; emitAssign (CmmLocal (idToReg (NonVoid bndr))) (idInfoToAmode v_info)
; _ <- bindArgsToRegs [NonVoid bndr]
; cgAlts NoGcInAlts (NonVoid bndr) alt_type alts }
where
@@ -330,8 +378,11 @@ cgCase scrut@(StgApp v []) _ _ (PrimAlt _) _
do { mb_cc <- maybeSaveCostCentre True
; withSequel (AssignTo [idToReg (NonVoid v)] False) (cgExpr scrut)
; restoreCurrentCostCentre mb_cc
- ; emit $ mkComment $ mkFastString "should be unreachable code"
- ; emit $ withFreshLabel "l" (\l -> mkLabel l <*> mkBranch l)}
+ ; emitComment $ mkFastString "should be unreachable code"
+ ; l <- newLabelC
+ ; emitLabel l
+ ; emit (mkBranch l)
+ }
{-
case seq# a s of v
@@ -349,7 +400,7 @@ cgCase (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _) bndr srt alt_type alts
= -- handle seq#, same return convention as vanilla 'a'.
cgCase (StgApp a []) bndr srt alt_type alts
-cgCase scrut bndr srt alt_type alts
+cgCase scrut bndr _srt alt_type alts
= -- the general case
do { up_hp_usg <- getVirtHp -- Upstream heap usage
; let ret_bndrs = chooseReturnBndrs bndr alt_type alts
@@ -359,7 +410,7 @@ cgCase scrut bndr srt alt_type alts
| isSingleton alts = False
| up_hp_usg > 0 = False
| otherwise = True
- gc_plan = if gcInAlts then GcInAlts alt_regs srt else NoGcInAlts
+ gc_plan = if gcInAlts then GcInAlts alt_regs else NoGcInAlts
; mb_cc <- maybeSaveCostCentre simple_scrut
; withSequel (AssignTo alt_regs gcInAlts) (cgExpr scrut)
@@ -417,14 +468,14 @@ chooseReturnBndrs _ _ _ = panic "chooseReturnBndrs"
cgAlts :: GcPlan -> NonVoid Id -> AltType -> [StgAlt] -> FCode ()
-- At this point the result of the case are in the binders
cgAlts gc_plan _bndr PolyAlt [(_, _, _, rhs)]
- = maybeAltHeapCheck gc_plan (cgExpr rhs)
+ = maybeAltHeapCheck gc_plan Nothing (cgExpr rhs)
cgAlts gc_plan _bndr (UbxTupAlt _) [(_, _, _, rhs)]
- = maybeAltHeapCheck gc_plan (cgExpr rhs)
+ = maybeAltHeapCheck gc_plan Nothing (cgExpr rhs)
-- Here bndrs are *already* in scope, so don't rebind them
cgAlts gc_plan bndr (PrimAlt _) alts
- = do { tagged_cmms <- cgAltRhss gc_plan bndr alts
+ = do { tagged_cmms <- cgAltRhss gc_plan Nothing bndr alts
; let bndr_reg = CmmLocal (idToReg bndr)
(DEFAULT,deflt) = head tagged_cmms
@@ -433,20 +484,17 @@ cgAlts gc_plan bndr (PrimAlt _) alts
tagged_cmms' = [(lit,code)
| (LitAlt lit, code) <- tagged_cmms]
- ; emit (mkCmmLitSwitch (CmmReg bndr_reg) tagged_cmms' deflt) }
+ ; emitCmmLitSwitch (CmmReg bndr_reg) tagged_cmms' deflt }
cgAlts gc_plan bndr (AlgAlt tycon) alts
- = do { tagged_cmms <- cgAltRhss gc_plan bndr alts
-
+ = do { retry_lbl <- newLabelC
+ ; emitLabel retry_lbl -- Note [alg-alt heap checks]
+
+ ; (mb_deflt, branches) <- cgAlgAltRhss gc_plan (Just retry_lbl)
+ bndr alts
+
; let fam_sz = tyConFamilySize tycon
bndr_reg = CmmLocal (idToReg bndr)
- mb_deflt = case tagged_cmms of
- ((DEFAULT,rhs) : _) -> Just rhs
- _other -> Nothing
- -- DEFAULT is always first, if present
-
- branches = [ (dataConTagZ con, cmm)
- | (DataAlt con, cmm) <- tagged_cmms ]
-- Is the constructor tag in the node reg?
; if isSmallFamily fam_sz
@@ -467,23 +515,68 @@ cgAlts gc_plan bndr (AlgAlt tycon) alts
cgAlts _ _ _ _ = panic "cgAlts"
-- UbxTupAlt and PolyAlt have only one alternative
+
+-- Note [alg-alt heap check]
+--
+-- In an algebraic case with more than one alternative, we will have
+-- code like
+--
+-- L0:
+-- x = R1
+-- goto L1
+-- L1:
+-- if (x & 7 >= 2) then goto L2 else goto L3
+-- L2:
+-- Hp = Hp + 16
+-- if (Hp > HpLim) then goto L4
+-- ...
+-- L4:
+-- call gc() returns to L5
+-- L5:
+-- x = R1
+-- goto L1
+
-------------------
-cgAltRhss :: GcPlan -> NonVoid Id -> [StgAlt] -> FCode [(AltCon, CmmAGraph)]
-cgAltRhss gc_plan bndr alts
+cgAlgAltRhss :: GcPlan -> Maybe BlockId -> NonVoid Id -> [StgAlt]
+ -> FCode ( Maybe CmmAGraph
+ , [(ConTagZ, CmmAGraph)] )
+cgAlgAltRhss gc_plan retry_lbl bndr alts
+ = do { tagged_cmms <- cgAltRhss gc_plan retry_lbl bndr alts
+
+ ; let { mb_deflt = case tagged_cmms of
+ ((DEFAULT,rhs) : _) -> Just rhs
+ _other -> Nothing
+ -- DEFAULT is always first, if present
+
+ ; branches = [ (dataConTagZ con, cmm)
+ | (DataAlt con, cmm) <- tagged_cmms ]
+ }
+
+ ; return (mb_deflt, branches)
+ }
+
+
+-------------------
+cgAltRhss :: GcPlan -> Maybe BlockId -> NonVoid Id -> [StgAlt]
+ -> FCode [(AltCon, CmmAGraph)]
+cgAltRhss gc_plan retry_lbl bndr alts
= forkAlts (map cg_alt alts)
where
base_reg = idToReg bndr
cg_alt :: StgAlt -> FCode (AltCon, CmmAGraph)
cg_alt (con, bndrs, _uses, rhs)
= getCodeR $
- maybeAltHeapCheck gc_plan $
+ maybeAltHeapCheck gc_plan retry_lbl $
do { _ <- bindConArgs con base_reg bndrs
; cgExpr rhs
; return con }
-maybeAltHeapCheck :: GcPlan -> FCode a -> FCode a
-maybeAltHeapCheck NoGcInAlts code = code
-maybeAltHeapCheck (GcInAlts regs _) code = altHeapCheck regs code
+maybeAltHeapCheck :: GcPlan -> Maybe BlockId -> FCode a -> FCode a
+maybeAltHeapCheck NoGcInAlts _ code = code
+maybeAltHeapCheck (GcInAlts regs) mlbl code =
+ case mlbl of
+ Nothing -> altHeapCheck regs code
+ Just retry_lbl -> altHeapCheckReturnsTo regs retry_lbl code
-----------------------------------------------------------------------------
-- Tail calls
@@ -517,8 +610,8 @@ cgIdApp fun_id args
cgLneJump :: BlockId -> [LocalReg] -> [StgArg] -> FCode ()
cgLneJump blk_id lne_regs args -- Join point; discard sequel
= do { cmm_args <- getNonVoidArgAmodes args
- ; emit (mkMultiAssign lne_regs cmm_args
- <*> mkBranch blk_id) }
+ ; emitMultiAssign lne_regs cmm_args
+ ; emit (mkBranch blk_id) }
cgTailCall :: Id -> CgIdInfo -> [StgArg] -> FCode ()
cgTailCall fun_id fun_info args = do
@@ -529,65 +622,91 @@ cgTailCall fun_id fun_info args = do
ReturnIt -> emitReturn [fun] -- ToDo: does ReturnIt guarantee tagged?
EnterIt -> ASSERT( null args ) -- Discarding arguments
- do { let fun' = CmmLoad fun (cmmExprType fun)
- ; [ret,call] <- forkAlts [
- getCode $ emitReturn [fun], -- Is tagged; no need to untag
- getCode $ do -- emit (mkAssign nodeReg fun)
- emitCall (NativeNodeCall, NativeReturn)
- (entryCode fun') [fun]] -- Not tagged
- ; emit (mkCmmIfThenElse (cmmIsTagged fun) ret call) }
-
- SlowCall -> do -- A slow function call via the RTS apply routines
+ emitEnter fun
+
+ SlowCall -> do -- A slow function call via the RTS apply routines
{ tickySlowCall lf_info args
- ; emit $ mkComment $ mkFastString "slowCall"
+ ; emitComment $ mkFastString "slowCall"
; slowCall fun args }
-- A direct function call (possibly with some left-over arguments)
DirectEntry lbl arity -> do
{ tickyDirectCall arity args
- ; if node_points then
- do emit $ mkComment $ mkFastString "directEntry"
- emit (mkAssign nodeReg fun)
- directCall lbl arity args
- else do emit $ mkComment $ mkFastString "directEntry else"
- directCall lbl arity args }
+ ; if node_points
+ then directCall NativeNodeCall lbl arity (fun_arg:args)
+ else directCall NativeDirectCall lbl arity args }
JumpToIt {} -> panic "cgTailCall" -- ???
where
- fun_name = idName fun_id
+ fun_arg = StgVarArg fun_id
+ fun_name = idName fun_id
fun = idInfoToAmode fun_info
lf_info = cgIdInfoLF fun_info
node_points = nodeMustPointToIt lf_info
-{- Note [case on Bool]
- ~~~~~~~~~~~~~~~~~~~
-A case on a Boolean value does two things:
- 1. It looks up the Boolean in a closure table and assigns the
- result to the binder.
- 2. It branches to the True or False case through analysis
- of the closure assigned to the binder.
-But the indirection through the closure table is unnecessary
-if the assignment to the binder will be dead code (use isDeadBndr).
+emitEnter :: CmmExpr -> FCode ()
+emitEnter fun = do
+ { adjustHpBackwards
+ ; sequel <- getSequel
+ ; updfr_off <- getUpdFrameOff
+ ; case sequel of
+ -- For a return, we have the option of generating a tag-test or
+ -- not. If the value is tagged, we can return directly, which
+ -- is quicker than entering the value. This is a code
+ -- size/speed trade-off: when optimising for speed rather than
+ -- size we could generate the tag test.
+ --
+ -- Right now, we do what the old codegen did, and omit the tag
+ -- test, just generating an enter.
+ Return _ -> do
+ { let entry = entryCode $ closureInfoPtr $ CmmReg nodeReg
+ ; emit $ mkForeignJump NativeNodeCall entry
+ [cmmUntag fun] updfr_off
+ }
+
+ -- The result will be scrutinised in the sequel. This is where
+ -- we generate a tag-test to avoid entering the closure if
+ -- possible.
+ --
+ -- The generated code will be something like this:
+ --
+ -- R1 = fun -- copyout
+ -- if (fun & 7 != 0) goto Lcall else goto Lret
+ -- Lcall:
+ -- call [fun] returns to Lret
+ -- Lret:
+ -- fun' = R1 -- copyin
+ -- ...
+ --
+ -- Note in particular that the label Lret is used as a
+ -- destination by both the tag-test and the call. This is
+ -- becase Lret will necessarily be a proc-point, and we want to
+ -- ensure that we generate only one proc-point for this
+ -- sequence.
+ --
+ AssignTo res_regs _ -> do
+ { lret <- newLabelC
+ ; lcall <- newLabelC
+ ; let area = Young lret
+ ; let (off, copyin) = copyInOflow NativeReturn area res_regs
+ (outArgs, regs, copyout) = copyOutOflow NativeNodeCall Call area
+ [fun] updfr_off (0,[])
+ -- refer to fun via nodeReg after the copyout, to avoid having
+ -- both live simultaneously; this sometimes enables fun to be
+ -- inlined in the RHS of the R1 assignment.
+ ; let entry = entryCode (closureInfoPtr (CmmReg nodeReg))
+ the_call = toCall entry (Just lret) updfr_off off outArgs regs
+ ; emit $
+ copyout <*>
+ mkCbranch (cmmIsTagged (CmmReg nodeReg)) lret lcall <*>
+ outOfLine lcall the_call <*>
+ mkLabel lret <*>
+ copyin
+ }
+ }
-The following example illustrates how badly the code turns out:
- STG:
- case <=## [ww_s7Hx y_s7HD] of wild2_sbH8 {
- GHC.Types.False -> <true code> // sbH8 dead
- GHC.Types.True -> <false code> // sbH8 dead
- };
- Cmm:
- _s7HD::F64 = F64[_sbH7::I64 + 7]; // MidAssign
- _ccsW::I64 = %MO_F_Le_W64(_s7Hx::F64, _s7HD::F64); // MidAssign
- // emitReturn // MidComment
- _sbH8::I64 = I64[ghczmprim_GHCziBool_Bool_closure_tbl + (_ccsW::I64 << 3)]; // MidAssign
- _ccsX::I64 = _sbH8::I64 & 7; // MidAssign
- if (_ccsX::I64 >= 2) goto ccsH; else goto ccsI; // LastCondBranch
-
-The assignments to _sbH8 and _ccsX are completely unnecessary.
-Instead, we should branch based on the value of _ccsW.
--}
{- Note [Better Alt Heap Checks]
If two function calls can share a return point, then they will also
diff --git a/compiler/codeGen/StgCmmForeign.hs b/compiler/codeGen/StgCmmForeign.hs
index 5bc0f7af4e..c67e0e0c95 100644
--- a/compiler/codeGen/StgCmmForeign.hs
+++ b/compiler/codeGen/StgCmmForeign.hs
@@ -22,6 +22,7 @@ import StgCmmEnv
import StgCmmMonad
import StgCmmUtils
import StgCmmClosure
+import StgCmmLayout
import BlockId
import Cmm
@@ -45,15 +46,16 @@ import Control.Monad
-- Code generation for Foreign Calls
-----------------------------------------------------------------------------
-cgForeignCall :: [LocalReg] -- r1,r2 where to put the results
- -> [ForeignHint]
- -> ForeignCall -- the op
+-- | emit code for a foreign call, and return the results to the sequel.
+--
+cgForeignCall :: ForeignCall -- the op
-> [StgArg] -- x,y arguments
+ -> Type -- result type
-> FCode ()
--- Emits code for an unsafe foreign call: r1, r2 = foo( x, y, z )
-cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_args
+cgForeignCall (CCall (CCallSpec target cconv safety)) stg_args res_ty
= do { cmm_args <- getFCallArgs stg_args
+ ; (res_regs, res_hints) <- newUnboxedTupleRegs res_ty
; let ((call_args, arg_hints), cmm_target)
= case target of
StaticTarget _ _ False ->
@@ -63,7 +65,7 @@ cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_a
= case mPkgId of
Nothing -> ForeignLabelInThisPackage
Just pkgId -> ForeignLabelInPackage pkgId
- size = call_size cmm_args
+ size = call_size cmm_args
in ( unzip cmm_args
, CmmLit (CmmLabel
(mkForeignLabel lbl size labelSource IsFunction)))
@@ -71,13 +73,31 @@ cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_a
DynamicTarget -> case cmm_args of
(fn,_):rest -> (unzip rest, fn)
[] -> panic "cgForeignCall []"
- fc = ForeignConvention cconv arg_hints result_hints
+ fc = ForeignConvention cconv arg_hints res_hints
call_target = ForeignTarget cmm_target fc
- ; srt <- getSRTInfo NoSRT -- SLPJ: Not sure what SRT
- -- is right here
- -- JD: Does it matter in the new codegen?
- ; emitForeignCall safety results call_target call_args srt CmmMayReturn }
+ -- we want to emit code for the call, and then emitReturn.
+ -- However, if the sequel is AssignTo, we shortcut a little
+ -- and generate a foreign call that assigns the results
+ -- directly. Otherwise we end up generating a bunch of
+ -- useless "r = r" assignments, which are not merely annoying:
+ -- they prevent the common block elimination from working correctly
+ -- in the case of a safe foreign call.
+ -- See Note [safe foreign call convention]
+ --
+ ; sequel <- getSequel
+ ; case sequel of
+ AssignTo assign_to_these _ ->
+ do { emitForeignCall safety assign_to_these call_target
+ call_args CmmMayReturn
+ }
+
+ _something_else ->
+ do { emitForeignCall safety res_regs call_target
+ call_args CmmMayReturn
+ ; emitReturn (map (CmmReg . CmmLocal) res_regs)
+ }
+ }
where
-- in the stdcall calling convention, the symbol needs @size appended
-- to it, where size is the total number of bytes of arguments. We
@@ -88,16 +108,83 @@ cgForeignCall results result_hints (CCall (CCallSpec target cconv safety)) stg_a
| otherwise = Nothing
-- ToDo: this might not be correct for 64-bit API
- arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType arg) wORD_SIZE
+ arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType arg)
+ wORD_SIZE
+
+{- Note [safe foreign call convention]
+
+The simple thing to do for a safe foreign call would be the same as an
+unsafe one: just
+
+ emitForeignCall ...
+ emitReturn ...
+
+but consider what happens in this case
+
+ case foo x y z of
+ (# s, r #) -> ...
+
+The sequel is AssignTo [r]. The call to newUnboxedTupleRegs picks [r]
+as the result reg, and we generate
+
+ r = foo(x,y,z) returns to L1 -- emitForeignCall
+ L1:
+ r = r -- emitReturn
+ goto L2
+L2:
+ ...
+
+Now L1 is a proc point (by definition, it is the continuation of the
+safe foreign call). If L2 does a heap check, then L2 will also be a
+proc point.
+
+Furthermore, the stack layout algorithm has to arrange to save r
+somewhere between the call and the jump to L1, which is annoying: we
+would have to treat r differently from the other live variables, which
+have to be saved *before* the call.
+
+So we adopt a special convention for safe foreign calls: the results
+are copied out according to the NativeReturn convention by the call,
+and the continuation of the call should copyIn the results. (The
+copyOut code is actually inserted when the safe foreign call is
+lowered later). The result regs attached to the safe foreign call are
+only used temporarily to hold the results before they are copied out.
+
+We will now generate this:
+
+ r = foo(x,y,z) returns to L1
+ L1:
+ r = R1 -- copyIn, inserted by mkSafeCall
+ goto L2
+ L2:
+ ... r ...
+
+And when the safe foreign call is lowered later (see Note [lower safe
+foreign calls]) we get this:
+
+ suspendThread()
+ r = foo(x,y,z)
+ resumeThread()
+ R1 = r -- copyOut, inserted by lowerSafeForeignCall
+ jump L1
+ L1:
+ r = R1 -- copyIn, inserted by mkSafeCall
+ goto L2
+ L2:
+ ... r ...
+
+Now consider what happens if L2 does a heap check: the Adams
+optimisation kicks in and commons up L1 with the heap-check
+continuation, resulting in just one proc point instead of two. Yay!
+-}
+
emitCCall :: [(CmmFormal,ForeignHint)]
-> CmmExpr
-> [(CmmActual,ForeignHint)]
-> FCode ()
emitCCall hinted_results fn hinted_args
- = emitForeignCall PlayRisky results target args
- NoC_SRT -- No SRT b/c we PlayRisky
- CmmMayReturn
+ = emitForeignCall PlayRisky results target args CmmMayReturn
where
(args, arg_hints) = unzip hinted_args
(results, result_hints) = unzip hinted_results
@@ -107,7 +194,7 @@ emitCCall hinted_results fn hinted_args
emitPrimCall :: [CmmFormal] -> CallishMachOp -> [CmmActual] -> FCode ()
emitPrimCall res op args
- = emitForeignCall PlayRisky res (PrimTarget op) args NoC_SRT CmmMayReturn
+ = emitForeignCall PlayRisky res (PrimTarget op) args CmmMayReturn
-- alternative entry point, used by CmmParse
emitForeignCall
@@ -115,11 +202,10 @@ emitForeignCall
-> [CmmFormal] -- where to put the results
-> ForeignTarget -- the op
-> [CmmActual] -- arguments
- -> C_SRT -- the SRT of the calls continuation
-> CmmReturnInfo -- This can say "never returns"
-- only RTS procedures do this
-> FCode ()
-emitForeignCall safety results target args _srt _ret
+emitForeignCall safety results target args _ret
| not (playSafe safety) = do
let (caller_save, caller_load) = callerSaveVolatileRegs
emit caller_save
@@ -129,7 +215,9 @@ emitForeignCall safety results target args _srt _ret
| otherwise = do
updfr_off <- getUpdFrameOff
temp_target <- load_target_into_temp target
- emit $ mkSafeCall temp_target results args updfr_off (playInterruptible safety)
+ emit =<< mkSafeCall temp_target results args updfr_off
+ (playInterruptible safety)
+
{-
@@ -162,7 +250,7 @@ maybe_assign_temp e
-- expressions, which are wrong here.
-- this is a NonPtr because it only duplicates an existing
reg <- newTemp (cmmExprType e) --TODO FIXME NOW
- emit (mkAssign (CmmLocal reg) e)
+ emitAssign (CmmLocal reg) e
return (CmmReg (CmmLocal reg))
-- -----------------------------------------------------------------------------
@@ -184,12 +272,12 @@ saveThreadState =
emitSaveThreadState :: BlockId -> FCode ()
emitSaveThreadState bid = do
-- CurrentTSO->stackobj->sp = Sp;
- emit $ mkStore (cmmOffset (CmmLoad (cmmOffset stgCurrentTSO tso_stackobj) bWord) stack_SP)
- (CmmStackSlot (CallArea (Young bid)) (widthInBytes (typeWidth gcWord)))
+ emitStore (cmmOffset (CmmLoad (cmmOffset stgCurrentTSO tso_stackobj) bWord) stack_SP)
+ (CmmStackSlot (Young bid) (widthInBytes (typeWidth gcWord)))
emit closeNursery
-- and save the current cost centre stack in the TSO when profiling:
when opt_SccProfilingOn $
- emit (mkStore (cmmOffset stgCurrentTSO tso_CCCS) curCCS)
+ emitStore (cmmOffset stgCurrentTSO tso_CCCS) curCCS
-- CurrentNursery->free = Hp+1;
closeNursery :: CmmAGraph
diff --git a/compiler/codeGen/StgCmmHeap.hs b/compiler/codeGen/StgCmmHeap.hs
index 25161722f7..bc61cf5b97 100644
--- a/compiler/codeGen/StgCmmHeap.hs
+++ b/compiler/codeGen/StgCmmHeap.hs
@@ -10,7 +10,7 @@ module StgCmmHeap (
getVirtHp, setVirtHp, setRealHp,
getHpRelOffset, hpRel,
- entryHeapCheck, altHeapCheck,
+ entryHeapCheck, altHeapCheck, altHeapCheckReturnsTo,
mkVirtHeapOffsets, mkVirtConstrOffsets,
mkStaticClosureFields, mkStaticClosure,
@@ -20,7 +20,6 @@ module StgCmmHeap (
#include "HsVersions.h"
-import CmmType
import StgSyn
import CLabel
import StgCmmLayout
@@ -34,6 +33,7 @@ import StgCmmEnv
import MkGraph
+import Hoopl hiding ((<*>), mkBranch)
import SMRep
import Cmm
import CmmUtils
@@ -45,6 +45,8 @@ import FastString( mkFastString, fsLit )
import Constants
import Util
+import Control.Monad (when)
+
-----------------------------------------------------------
-- Initialise dynamic heap objects
-----------------------------------------------------------
@@ -109,7 +111,7 @@ allocDynClosureCmm info_tbl lf_info use_cc _blame_cc amodes_w_offsets
-- ALLOCATE THE OBJECT
; base <- getHpRelOffset info_offset
- ; emit (mkComment $ mkFastString "allocDynClosure")
+ ; emitComment $ mkFastString "allocDynClosure"
; emitSetDynHdr base info_ptr use_cc
; let (cmm_args, offsets) = unzip amodes_w_offsets
; hpStore base cmm_args offsets
@@ -151,9 +153,10 @@ mkStaticClosureFields
:: CmmInfoTable
-> CostCentreStack
-> CafInfo
+ -> Bool -- SRT is non-empty?
-> [CmmLit] -- Payload
-> [CmmLit] -- The full closure
-mkStaticClosureFields info_tbl ccs caf_refs payload
+mkStaticClosureFields info_tbl ccs caf_refs has_srt payload
= mkStaticClosure info_lbl ccs payload padding
static_link_field saved_info_field
where
@@ -178,8 +181,10 @@ mkStaticClosureFields info_tbl ccs caf_refs payload
| otherwise = ASSERT(null payload) [mkIntCLit 0]
static_link_field
- | is_caf || staticClosureNeedsLink info_tbl = [static_link_value]
- | otherwise = []
+ | is_caf || staticClosureNeedsLink has_srt info_tbl
+ = [static_link_value]
+ | otherwise
+ = []
saved_info_field
| is_caf = [mkIntCLit 0]
@@ -335,11 +340,12 @@ entryHeapCheck cl_info offset nodeSet arity args code
args' = map (CmmReg . CmmLocal) args
setN = case nodeSet of
- Just n -> mkAssign nodeReg (CmmReg $ CmmLocal n)
+ Just _ -> mkNop -- No need to assign R1, it already
+ -- points to the closure
Nothing -> mkAssign nodeReg $
CmmLit (CmmLabel $ staticClosureLabel cl_info)
- {- Thunks: Set R1 = node, jump GCEnter1
+ {- Thunks: jump GCEnter1
Function (fast): Set R1 = node, jump GCFun
Function (slow): Set R1 = node, call generic_gc -}
gc_call upd = setN <*> gc_lbl upd
@@ -354,7 +360,10 @@ entryHeapCheck cl_info offset nodeSet arity args code
- GC calls, but until then this fishy code works -}
updfr_sz <- getUpdFrameOff
- heapCheck True (gc_call updfr_sz) code
+
+ loop_id <- newLabelC
+ emitLabel loop_id
+ heapCheck True (gc_call updfr_sz <*> mkBranch loop_id) code
{-
-- This code is slightly outdated now and we could easily keep the above
@@ -400,21 +409,29 @@ entryHeapCheck cl_info offset nodeSet arity args code
-}
---------------------------------------------------------------
--- A heap/stack check at in a case alternative
+-- ------------------------------------------------------------
+-- A heap/stack check in a case alternative
altHeapCheck :: [LocalReg] -> FCode a -> FCode a
altHeapCheck regs code
+ = do loop_id <- newLabelC
+ emitLabel loop_id
+ altHeapCheckReturnsTo regs loop_id code
+
+altHeapCheckReturnsTo :: [LocalReg] -> Label -> FCode a -> FCode a
+altHeapCheckReturnsTo regs retry_lbl code
= do updfr_sz <- getUpdFrameOff
- heapCheck False (gc_call updfr_sz) code
+ gc_call_code <- gc_call updfr_sz
+ heapCheck False (gc_call_code <*> mkBranch retry_lbl) code
where
reg_exprs = map (CmmReg . CmmLocal) regs
+ -- Note [stg_gc arguments]
gc_call sp =
case rts_label regs of
- Just gc -> mkCall (CmmLit gc) (GC, GC) regs reg_exprs sp
- Nothing -> mkCall generic_gc (GC, GC) [] [] sp
+ Just gc -> mkCall (CmmLit gc) (GC, GC) regs reg_exprs sp (0,[])
+ Nothing -> mkCall generic_gc (GC, GC) [] [] sp (0,[])
rts_label [reg]
| isGcPtrType ty = Just (mkGcLabel "stg_gc_unpt_r1")
@@ -432,6 +449,23 @@ altHeapCheck regs code
rts_label _ = Nothing
+-- Note [stg_gc arguments]
+-- It might seem that we could avoid passing the arguments to the
+-- stg_gc function, because they are already in the right registers.
+-- While this is usually the case, it isn't always. Sometimes the
+-- code generator has cleverly avoided the eval in a case, e.g. in
+-- ffi/should_run/4221.hs we found
+--
+-- case a_r1mb of z
+-- FunPtr x y -> ...
+--
+-- where a_r1mb is bound to a top-level constructor, and is known to be
+-- evaluated. The codegen just assigns x, y and z, and continues;
+-- R1 is never assigned.
+--
+-- So we'll have to rely on optimisations to eliminate these
+-- assignments where possible.
+
-- | The generic GC procedure; no params, no results
generic_gc :: CmmExpr
@@ -447,7 +481,7 @@ heapCheck checkStack do_gc code
= getHeapUsage $ \ hpHw ->
-- Emit heap checks, but be sure to do it lazily so
-- that the conditionals on hpHw don't cause a black hole
- do { emit $ do_checks checkStack hpHw do_gc
+ do { codeOnly $ do_checks checkStack hpHw do_gc
; tickyAllocHeap hpHw
; doGranAllocate hpHw
; setRealHp hpHw
@@ -456,22 +490,20 @@ heapCheck checkStack do_gc code
do_checks :: Bool -- Should we check the stack?
-> WordOff -- Heap headroom
-> CmmAGraph -- What to do on failure
- -> CmmAGraph
-do_checks checkStack alloc do_gc
- = withFreshLabel "gc" $ \ loop_id ->
- withFreshLabel "gc" $ \ gc_id ->
- mkLabel loop_id
- <*> (let hpCheck = if alloc == 0 then mkNop
- else mkAssign hpReg bump_hp <*>
- mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id)
- in if checkStack
- then mkCmmIfThenElse sp_oflo (mkBranch gc_id) hpCheck
- else hpCheck)
- <*> mkComment (mkFastString "outOfLine should follow:")
- <*> outOfLine (mkLabel gc_id
- <*> mkComment (mkFastString "outOfLine here")
- <*> do_gc
- <*> mkBranch loop_id)
+ -> FCode ()
+do_checks checkStack alloc do_gc = do
+ gc_id <- newLabelC
+
+ when checkStack $
+ emit =<< mkCmmIfGoto sp_oflo gc_id
+
+ when (alloc /= 0) $ do
+ emitAssign hpReg bump_hp
+ emit =<< mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id)
+
+ emitOutOfLine gc_id $
+ do_gc -- this is expected to jump back somewhere
+
-- Test for stack pointer exhaustion, then
-- bump heap pointer, and test for heap exhaustion
-- Note that we don't move the heap pointer unless the
diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs
index 86986efdfa..9c17716b1b 100644
--- a/compiler/codeGen/StgCmmLayout.hs
+++ b/compiler/codeGen/StgCmmLayout.hs
@@ -15,7 +15,7 @@
module StgCmmLayout (
mkArgDescr,
- emitCall, emitReturn,
+ emitCall, emitReturn, adjustHpBackwards,
emitClosureProcAndInfoTable,
emitClosureAndInfoTable,
@@ -41,10 +41,12 @@ import StgCmmEnv
import StgCmmTicky
import StgCmmMonad
import StgCmmUtils
+import StgCmmProf
import MkGraph
import SMRep
import Cmm
+import CmmUtils
import CLabel
import StgSyn
import Id
@@ -52,6 +54,7 @@ import Name
import TyCon ( PrimRep(..) )
import BasicTypes ( RepArity )
import StaticFlags
+import Module
import Constants
import Util
@@ -63,38 +66,60 @@ import FastString
-- Call and return sequences
------------------------------------------------------------------------
-emitReturn :: [CmmExpr] -> FCode ()
--- Return multiple values to the sequel
+-- | Return multiple values to the sequel
+--
+-- If the sequel is @Return@
+--
+-- > return (x,y)
+--
+-- If the sequel is @AssignTo [p,q]@
+--
+-- > p=x; q=y;
--
--- If the sequel is Return
--- return (x,y)
--- If the sequel is AssignTo [p,q]
--- p=x; q=y;
+emitReturn :: [CmmExpr] -> FCode ()
emitReturn results
= do { sequel <- getSequel;
; updfr_off <- getUpdFrameOff
- ; emit $ mkComment $ mkFastString ("emitReturn: " ++ show sequel)
+ ; emitComment $ mkFastString ("emitReturn: " ++ show sequel)
; case sequel of
Return _ ->
do { adjustHpBackwards
; emit (mkReturnSimple results updfr_off) }
AssignTo regs adjust ->
do { if adjust then adjustHpBackwards else return ()
- ; emit (mkMultiAssign regs results) }
+ ; emitMultiAssign regs results }
}
+
+-- | @emitCall conv fun args@ makes a call to the entry-code of @fun@,
+-- using the call/return convention @conv@, passing @args@, and
+-- returning the results to the current sequel.
+--
emitCall :: (Convention, Convention) -> CmmExpr -> [CmmExpr] -> FCode ()
--- (cgCall fun args) makes a call to the entry-code of 'fun',
--- passing 'args', and returning the results to the current sequel
-emitCall convs@(callConv, _) fun args
+emitCall convs fun args
+ = emitCallWithExtraStack convs fun args noExtraStack
+
+
+-- | @emitCallWithExtraStack conv fun args stack@ makes a call to the
+-- entry-code of @fun@, using the call/return convention @conv@,
+-- passing @args@, pushing some extra stack frames described by
+-- @stack@, and returning the results to the current sequel.
+--
+emitCallWithExtraStack
+ :: (Convention, Convention) -> CmmExpr -> [CmmExpr]
+ -> (ByteOff, [(CmmExpr,ByteOff)]) -> FCode ()
+emitCallWithExtraStack convs@(callConv, _) fun args extra_stack
= do { adjustHpBackwards
; sequel <- getSequel
; updfr_off <- getUpdFrameOff
- ; emit $ mkComment $ mkFastString ("emitCall: " ++ show sequel)
- ; case sequel of
- Return _ -> emit (mkForeignJump callConv fun args updfr_off)
- AssignTo res_regs _ -> emit (mkCall fun convs res_regs args updfr_off)
- }
+ ; emitComment $ mkFastString ("emitCallWithExtraStack: " ++ show sequel)
+ ; case sequel of
+ Return _ ->
+ emit $ mkForeignJumpExtra callConv fun args updfr_off extra_stack
+ AssignTo res_regs _ -> do
+ emit =<< mkCall fun convs res_regs args updfr_off extra_stack
+ }
+
adjustHpBackwards :: FCode ()
-- This function adjusts the heap pointer just before a tail call or
@@ -127,59 +152,142 @@ adjustHpBackwards
-- Making calls: directCall and slowCall
-------------------------------------------------------------------------
-directCall :: CLabel -> RepArity -> [StgArg] -> FCode ()
+-- General plan is:
+-- - we'll make *one* fast call, either to the function itself
+-- (directCall) or to stg_ap_<pat>_fast (slowCall)
+-- Any left-over arguments will be pushed on the stack,
+--
+-- e.g. Sp[old+8] = arg1
+-- Sp[old+16] = arg2
+-- Sp[old+32] = stg_ap_pp_info
+-- R2 = arg3
+-- R3 = arg4
+-- call f() return to Nothing updfr_off: 32
+
+
+directCall :: Convention -> CLabel -> RepArity -> [StgArg] -> FCode ()
-- (directCall f n args)
-- calls f(arg1, ..., argn), and applies the result to the remaining args
-- The function f has arity n, and there are guaranteed at least n args
-- Both arity and args include void args
-directCall lbl arity stg_args
- = do { cmm_args <- getNonVoidArgAmodes stg_args
- ; direct_call "directCall" lbl arity cmm_args (argsReps stg_args) }
+directCall conv lbl arity stg_args
+ = do { argreps <- getArgRepsAmodes stg_args
+ ; direct_call "directCall" conv lbl arity argreps }
+
slowCall :: CmmExpr -> [StgArg] -> FCode ()
-- (slowCall fun args) applies fun to args, returning the results to Sequel
slowCall fun stg_args
- = do { cmm_args <- getNonVoidArgAmodes stg_args
- ; slow_call fun cmm_args (argsReps stg_args) }
+ = do { dflags <- getDynFlags
+ ; argsreps <- getArgRepsAmodes stg_args
+ ; let (rts_fun, arity) = slowCallPattern (map fst argsreps)
+ ; direct_call "slow_call" NativeNodeCall
+ (mkRtsApFastLabel rts_fun) arity ((P,Just fun):argsreps)
+ ; emitComment $ mkFastString ("slow_call for " ++
+ showSDoc dflags (ppr fun) ++
+ " with pat " ++ unpackFS rts_fun)
+ }
+
--------------
-direct_call :: String -> CLabel -> RepArity -> [CmmExpr] -> [ArgRep] -> FCode ()
--- NB1: (length args) may be less than (length reps), because
--- the args exclude the void ones
--- NB2: 'arity' refers to the *reps*
-direct_call caller lbl arity args reps
- | debugIsOn && arity > length reps -- Too few args
+direct_call :: String
+ -> Convention -- e.g. NativeNodeCall or NativeDirectCall
+ -> CLabel -> RepArity
+ -> [(ArgRep,Maybe CmmExpr)] -> FCode ()
+direct_call caller call_conv lbl arity args
+ | debugIsOn && real_arity > length args -- Too few args
= do -- Caller should ensure that there are enough args!
- pprPanic "direct_call" (text caller <+> ppr arity
- <+> ppr lbl <+> ppr (length reps)
- <+> ppr args <+> ppr reps )
-
- | null rest_reps -- Precisely the right number of arguments
- = emitCall (NativeDirectCall, NativeReturn) target args
-
- | otherwise -- Over-saturated call
- = ASSERT( arity == length initial_reps )
- do { pap_id <- newTemp gcWord
- ; withSequel (AssignTo [pap_id] True)
- (emitCall (NativeDirectCall, NativeReturn) target fast_args)
- ; slow_call (CmmReg (CmmLocal pap_id))
- rest_args rest_reps }
+ pprPanic "direct_call" $
+ text caller <+> ppr arity <+>
+ ppr lbl <+> ppr (length args) <+>
+ ppr (map snd args) <+> ppr (map fst args)
+
+ | null rest_args -- Precisely the right number of arguments
+ = emitCall (call_conv, NativeReturn) target (nonVArgs args)
+
+ | otherwise -- Note [over-saturated calls]
+ = emitCallWithExtraStack (call_conv, NativeReturn)
+ target (nonVArgs fast_args) (mkStkOffsets stack_args)
where
target = CmmLit (CmmLabel lbl)
- (initial_reps, rest_reps) = splitAt arity reps
- arg_arity = count isNonV initial_reps
- (fast_args, rest_args) = splitAt arg_arity args
+ (fast_args, rest_args) = splitAt real_arity args
+ stack_args = slowArgs rest_args
+ real_arity = case call_conv of
+ NativeNodeCall -> arity+1
+ _ -> arity
---------------
-slow_call :: CmmExpr -> [CmmExpr] -> [ArgRep] -> FCode ()
-slow_call fun args reps
- = do dflags <- getDynFlags
- call <- getCode $ direct_call "slow_call" (mkRtsApFastLabel rts_fun) arity args reps
- emit $ mkComment $ mkFastString ("slow_call for " ++ showSDoc dflags (ppr fun) ++
- " with pat " ++ unpackFS rts_fun)
- emit (mkAssign nodeReg fun <*> call)
+
+-- When constructing calls, it is easier to keep the ArgReps and the
+-- CmmExprs zipped together. However, a void argument has no
+-- representation, so we need to use Maybe CmmExpr (the alternative of
+-- using zeroCLit or even undefined would work, but would be ugly).
+--
+getArgRepsAmodes :: [StgArg] -> FCode [(ArgRep, Maybe CmmExpr)]
+getArgRepsAmodes = mapM getArgRepAmode
+ where getArgRepAmode arg
+ | V <- rep = return (V, Nothing)
+ | otherwise = do expr <- getArgAmode (NonVoid arg)
+ return (rep, Just expr)
+ where rep = toArgRep (argPrimRep arg)
+
+nonVArgs :: [(ArgRep, Maybe CmmExpr)] -> [CmmExpr]
+nonVArgs [] = []
+nonVArgs ((_,Nothing) : args) = nonVArgs args
+nonVArgs ((_,Just arg) : args) = arg : nonVArgs args
+
+{-
+Note [over-saturated calls]
+
+The natural thing to do for an over-saturated call would be to call
+the function with the correct number of arguments, and then apply the
+remaining arguments to the value returned, e.g.
+
+ f a b c d (where f has arity 2)
+ -->
+ r = call f(a,b)
+ call r(c,d)
+
+but this entails
+ - saving c and d on the stack
+ - making a continuation info table
+ - at the continuation, loading c and d off the stack into regs
+ - finally, call r
+
+Note that since there are a fixed number of different r's
+(e.g. stg_ap_pp_fast), we can also pre-compile continuations
+that correspond to each of them, rather than generating a fresh
+one for each over-saturated call.
+
+Not only does this generate much less code, it is faster too. We will
+generate something like:
+
+Sp[old+16] = c
+Sp[old+24] = d
+Sp[old+32] = stg_ap_pp_info
+call f(a,b) -- usual calling convention
+
+For the purposes of the CmmCall node, we count this extra stack as
+just more arguments that we are passing on the stack (cml_args).
+-}
+
+-- | 'slowArgs' takes a list of function arguments and prepares them for
+-- pushing on the stack for "extra" arguments to a function which requires
+-- fewer arguments than we currently have.
+slowArgs :: [(ArgRep, Maybe CmmExpr)] -> [(ArgRep, Maybe CmmExpr)]
+slowArgs [] = []
+slowArgs args -- careful: reps contains voids (V), but args does not
+ | opt_SccProfilingOn = save_cccs ++ this_pat ++ slowArgs rest_args
+ | otherwise = this_pat ++ slowArgs rest_args
where
- (rts_fun, arity) = slowCallPattern reps
+ (arg_pat, n) = slowCallPattern (map fst args)
+ (call_args, rest_args) = splitAt n args
+
+ stg_ap_pat = mkCmmRetInfoLabel rtsPackageId arg_pat
+ this_pat = (N, Just (mkLblExpr stg_ap_pat)) : call_args
+ save_cccs = [(N, Just (mkLblExpr save_cccs_lbl)), (N, Just curCCS)]
+ save_cccs_lbl = mkCmmRetInfoLabel rtsPackageId (fsLit "stg_restore_cccs")
+
+
-- These cases were found to cover about 99% of all slow calls:
slowCallPattern :: [ArgRep] -> (FastString, RepArity)
@@ -202,6 +310,30 @@ slowCallPattern [] = (fsLit "stg_ap_0", 0)
-------------------------------------------------------------------------
+-- Fix the byte-offsets of a bunch of things to push on the stack
+
+-- This is used for pushing slow-call continuations.
+-- See Note [over-saturated calls].
+
+mkStkOffsets
+ :: [(ArgRep, Maybe CmmExpr)] -- things to make offsets for
+ -> ( ByteOff -- OUTPUTS: Topmost allocated word
+ , [(CmmExpr, ByteOff)] ) -- things with offsets (voids filtered out)
+mkStkOffsets things
+ = loop 0 [] (reverse things)
+ where
+ loop offset offs [] = (offset,offs)
+ loop offset offs ((_,Nothing):things) = loop offset offs things
+ -- ignore Void arguments
+ loop offset offs ((rep,Just thing):things)
+ = loop thing_off ((thing, thing_off):offs) things
+ where
+ thing_off = offset + argRepSizeW rep * wORD_SIZE
+ -- offset of thing is offset+size, because we're
+ -- growing the stack *downwards* as the offsets increase.
+
+
+-------------------------------------------------------------------------
-- Classifying arguments: ArgRep
-------------------------------------------------------------------------
@@ -237,10 +369,7 @@ isNonV :: ArgRep -> Bool
isNonV V = False
isNonV _ = True
-argsReps :: [StgArg] -> [ArgRep]
-argsReps = map (toArgRep . argPrimRep)
-
-argRepSizeW :: ArgRep -> WordOff -- Size in words
+argRepSizeW :: ArgRep -> WordOff -- Size in words
argRepSizeW N = 1
argRepSizeW P = 1
argRepSizeW F = 1
diff --git a/compiler/codeGen/StgCmmMonad.hs b/compiler/codeGen/StgCmmMonad.hs
index 4eea38e22c..602bdebcad 100644
--- a/compiler/codeGen/StgCmmMonad.hs
+++ b/compiler/codeGen/StgCmmMonad.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE GADTs #-}
-----------------------------------------------------------------------------
--
-- Monad for Stg to C-- code generation
@@ -16,16 +17,22 @@
module StgCmmMonad (
FCode, -- type
- initC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs,
+ initC, runC, thenC, thenFC, listCs, listFCs, mapCs, mapFCs,
returnFC, fixC, fixC_, nopC, whenC,
newUnique, newUniqSupply,
+ newLabelC, emitLabel,
+
emit, emitDecl, emitProc, emitProcWithConvention, emitSimpleProc,
+ emitOutOfLine, emitAssign, emitStore, emitComment,
getCmm, cgStmtsToBlocks,
getCodeR, getCode, getHeapUsage,
- forkClosureBody, forkStatics, forkAlts, forkProc, codeOnly,
+ mkCmmIfThenElse, mkCmmIfThen, mkCmmIfGoto,
+ mkCall, mkCmmCall, mkSafeCall,
+
+ forkClosureBody, forkStatics, forkAlts, forkProc, codeOnly,
ConTagZ,
@@ -69,12 +76,12 @@ import VarEnv
import OrdList
import Unique
import UniqSupply
-import FastString(sLit)
+import FastString
import Outputable
import Control.Monad
import Data.List
-import Prelude hiding( sequence )
+import Prelude hiding( sequence, succ )
import qualified Prelude( sequence )
infixr 9 `thenC` -- Right-associative!
@@ -95,12 +102,12 @@ instance Monad FCode where
{-# INLINE thenFC #-}
{-# INLINE returnFC #-}
-initC :: DynFlags -> Module -> FCode a -> IO a
-initC dflags mod (FCode code)
- = do { uniqs <- mkSplitUniqSupply 'c'
- ; case code (initCgInfoDown dflags mod) (initCgState uniqs) of
- (res, _) -> return res
- }
+initC :: IO CgState
+initC = do { uniqs <- mkSplitUniqSupply 'c'
+ ; return (initCgState uniqs) }
+
+runC :: DynFlags -> Module -> CgState -> FCode a -> (a,CgState)
+runC dflags mod st (FCode code) = code (initCgInfoDown dflags mod) st
returnFC :: a -> FCode a
returnFC val = FCode (\_info_down state -> (val, state))
@@ -270,6 +277,8 @@ data HeapUsage =
type VirtualHpOffset = WordOff
+
+
initCgState :: UniqSupply -> CgState
initCgState uniqs
= MkCgState { cgs_stmts = mkNop, cgs_tops = nilOL,
@@ -308,7 +317,6 @@ initHpUsage = HeapUsage { virtHp = 0, realHp = 0 }
maxHpHw :: HeapUsage -> VirtualHpOffset -> HeapUsage
hp_usg `maxHpHw` hw = hp_usg { virtHp = virtHp hp_usg `max` hw }
-
--------------------------------------------------------
-- Operators for getting and setting the state and "info_down".
--------------------------------------------------------
@@ -591,6 +599,33 @@ getHeapUsage fcode
-- ----------------------------------------------------------------------------
-- Combinators for emitting code
+emitCgStmt :: CgStmt -> FCode ()
+emitCgStmt stmt
+ = do { state <- getState
+ ; setState $ state { cgs_stmts = cgs_stmts state `snocOL` stmt }
+ }
+
+emitLabel :: BlockId -> FCode ()
+emitLabel id = emitCgStmt (CgLabel id)
+
+emitComment :: FastString -> FCode ()
+#if 0 /* def DEBUG */
+emitComment s = emitCgStmt (CgStmt (CmmComment s))
+#else
+emitComment _ = return ()
+#endif
+
+emitAssign :: CmmReg -> CmmExpr -> FCode ()
+emitAssign l r = emitCgStmt (CgStmt (CmmAssign l r))
+
+emitStore :: CmmExpr -> CmmExpr -> FCode ()
+emitStore l r = emitCgStmt (CgStmt (CmmStore l r))
+
+
+newLabelC :: FCode BlockId
+newLabelC = do { u <- newUnique
+ ; return $ mkBlockId u }
+
emit :: CmmAGraph -> FCode ()
emit ag
= do { state <- getState
@@ -601,6 +636,9 @@ emitDecl decl
= do { state <- getState
; setState $ state { cgs_tops = cgs_tops state `snocOL` decl } }
+emitOutOfLine :: BlockId -> CmmAGraph -> FCode ()
+emitOutOfLine l stmts = emitCgStmt (CgFork l stmts)
+
emitProcWithConvention :: Convention -> CmmInfoTable -> CLabel -> [CmmFormal] ->
CmmAGraph -> FCode ()
emitProcWithConvention conv info lbl args blocks
@@ -629,6 +667,60 @@ getCmm code
; setState $ state2 { cgs_tops = cgs_tops state1 }
; return (fromOL (cgs_tops state2)) }
+
+mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> FCode CmmAGraph
+mkCmmIfThenElse e tbranch fbranch = do
+ endif <- newLabelC
+ tid <- newLabelC
+ fid <- newLabelC
+ return $ mkCbranch e tid fid <*>
+ mkLabel tid <*> tbranch <*> mkBranch endif <*>
+ mkLabel fid <*> fbranch <*> mkLabel endif
+
+mkCmmIfGoto :: CmmExpr -> BlockId -> FCode CmmAGraph
+mkCmmIfGoto e tid = do
+ endif <- newLabelC
+ return $ mkCbranch e tid endif <*> mkLabel endif
+
+mkCmmIfThen :: CmmExpr -> CmmAGraph -> FCode CmmAGraph
+mkCmmIfThen e tbranch = do
+ endif <- newLabelC
+ tid <- newLabelC
+ return $ mkCbranch e tid endif <*>
+ mkLabel tid <*> tbranch <*> mkLabel endif
+
+
+mkCall :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmActual]
+ -> UpdFrameOffset -> (ByteOff,[(CmmExpr,ByteOff)]) -> FCode CmmAGraph
+mkCall f (callConv, retConv) results actuals updfr_off extra_stack = do
+ k <- newLabelC
+ let area = Young k
+ (off, copyin) = copyInOflow retConv area results
+ copyout = mkCallReturnsTo f callConv actuals k off updfr_off extra_stack
+ return (copyout <*> mkLabel k <*> copyin)
+
+mkCmmCall :: CmmExpr -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset
+ -> FCode CmmAGraph
+mkCmmCall f results actuals updfr_off
+ = mkCall f (NativeDirectCall, NativeReturn) results actuals updfr_off (0,[])
+
+
+mkSafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual]
+ -> UpdFrameOffset -> Bool
+ -> FCode CmmAGraph
+mkSafeCall t fs as upd i = do
+ k <- newLabelC
+ let (_off, copyout) = copyInOflow NativeReturn (Young k) fs
+ -- see Note [safe foreign call convention]
+ return
+ ( mkStore (CmmStackSlot (Young k) (widthInBytes wordWidth))
+ (CmmLit (CmmBlock k))
+ <*> mkLast (CmmForeignCall { tgt=t, res=fs, args=as, succ=k
+ , updfr=upd, intrbl=i })
+ <*> mkLabel k
+ <*> copyout
+ )
+
-- ----------------------------------------------------------------------------
-- CgStmts
@@ -640,4 +732,3 @@ cgStmtsToBlocks :: CmmAGraph -> FCode CmmGraph
cgStmtsToBlocks stmts
= do { us <- newUniqSupply
; return (initUs_ us (lgraphOfAGraph stmts)) }
-
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index efa234b5a6..bd783a3b30 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -14,7 +14,9 @@
-- for details
module StgCmmPrim (
- cgOpApp
+ cgOpApp,
+ cgPrimOp -- internal(ish), used by cgCase to get code for a
+ -- comparison without also turning it into a Bool.
) where
#include "HsVersions.h"
@@ -67,14 +69,9 @@ cgOpApp :: StgOp -- The op
-- Foreign calls
cgOpApp (StgFCallOp fcall _) stg_args res_ty
- = do { (res_regs, res_hints) <- newUnboxedTupleRegs res_ty
- -- Choose result regs r1, r2
- -- Note [Foreign call results]
- ; cgForeignCall res_regs res_hints fcall stg_args
- -- r1, r2 = foo( x, y )
- ; emitReturn (map (CmmReg . CmmLocal) res_regs) }
- -- return (r1, r2)
-
+ = cgForeignCall fcall stg_args res_ty
+ -- Note [Foreign call results]
+
-- tagToEnum# is special: we need to pull the constructor
-- out of the table, and perform an appropriate return.
@@ -229,23 +226,23 @@ emitPrimOp [res] SparkOp [arg]
[(tmp2,NoHint)]
(CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit "newSpark"))))
[(CmmReg (CmmGlobal BaseReg), AddrHint), ((CmmReg (CmmLocal tmp)), AddrHint)]
- emit (mkAssign (CmmLocal res) (CmmReg (CmmLocal tmp)))
+ emitAssign (CmmLocal res) (CmmReg (CmmLocal tmp))
emitPrimOp [res] GetCCSOfOp [arg]
- = emit (mkAssign (CmmLocal res) val)
+ = emitAssign (CmmLocal res) val
where
val | opt_SccProfilingOn = costCentreFrom (cmmUntag arg)
| otherwise = CmmLit zeroCLit
emitPrimOp [res] GetCurrentCCSOp [_dummy_arg]
- = emit (mkAssign (CmmLocal res) curCCS)
+ = emitAssign (CmmLocal res) curCCS
emitPrimOp [res] ReadMutVarOp [mutv]
- = emit (mkAssign (CmmLocal res) (cmmLoadIndexW mutv fixedHdrSize gcWord))
+ = emitAssign (CmmLocal res) (cmmLoadIndexW mutv fixedHdrSize gcWord)
emitPrimOp [] WriteMutVarOp [mutv,var]
= do
- emit (mkStore (cmmOffsetW mutv fixedHdrSize) var)
+ emitStore (cmmOffsetW mutv fixedHdrSize) var
emitCCall
[{-no results-}]
(CmmLit (CmmLabel mkDirty_MUT_VAR_Label))
@@ -269,32 +266,32 @@ emitPrimOp res@[] TouchOp args@[_arg]
-- #define byteArrayContentszh(r,a) r = BYTE_ARR_CTS(a)
emitPrimOp [res] ByteArrayContents_Char [arg]
- = emit (mkAssign (CmmLocal res) (cmmOffsetB arg arrWordsHdrSize))
+ = emitAssign (CmmLocal res) (cmmOffsetB arg arrWordsHdrSize)
-- #define stableNameToIntzh(r,s) (r = ((StgStableName *)s)->sn)
emitPrimOp [res] StableNameToIntOp [arg]
- = emit (mkAssign (CmmLocal res) (cmmLoadIndexW arg fixedHdrSize bWord))
+ = emitAssign (CmmLocal res) (cmmLoadIndexW arg fixedHdrSize bWord)
-- #define eqStableNamezh(r,sn1,sn2) \
-- (r = (((StgStableName *)sn1)->sn == ((StgStableName *)sn2)->sn))
emitPrimOp [res] EqStableNameOp [arg1,arg2]
- = emit (mkAssign (CmmLocal res) (CmmMachOp mo_wordEq [
+ = emitAssign (CmmLocal res) (CmmMachOp mo_wordEq [
cmmLoadIndexW arg1 fixedHdrSize bWord,
cmmLoadIndexW arg2 fixedHdrSize bWord
- ]))
+ ])
emitPrimOp [res] ReallyUnsafePtrEqualityOp [arg1,arg2]
- = emit (mkAssign (CmmLocal res) (CmmMachOp mo_wordEq [arg1,arg2]))
+ = emitAssign (CmmLocal res) (CmmMachOp mo_wordEq [arg1,arg2])
-- #define addrToHValuezh(r,a) r=(P_)a
emitPrimOp [res] AddrToAnyOp [arg]
- = emit (mkAssign (CmmLocal res) arg)
+ = emitAssign (CmmLocal res) arg
-- #define dataToTagzh(r,a) r=(GET_TAG(((StgClosure *)a)->header.info))
-- Note: argument may be tagged!
emitPrimOp [res] DataToTagOp [arg]
- = emit (mkAssign (CmmLocal res) (getConstrTag (cmmUntag arg)))
+ = emitAssign (CmmLocal res) (getConstrTag (cmmUntag arg))
{- Freezing arrays-of-ptrs requires changing an info table, for the
benefit of the generational collector. It needs to scavenge mutable
@@ -317,7 +314,7 @@ emitPrimOp [res] UnsafeFreezeArrayArrayOp [arg]
-- #define unsafeFreezzeByteArrayzh(r,a) r=(a)
emitPrimOp [res] UnsafeFreezeByteArrayOp [arg]
- = emit (mkAssign (CmmLocal res) arg)
+ = emitAssign (CmmLocal res) arg
-- Copying pointer arrays
@@ -497,11 +494,11 @@ emitPrimOp [res] PopCntOp [w] = emitPopCntCall res w wordWidth
-- The rest just translate straightforwardly
emitPrimOp [res] op [arg]
| nopOp op
- = emit (mkAssign (CmmLocal res) arg)
+ = emitAssign (CmmLocal res) arg
| Just (mop,rep) <- narrowOp op
- = emit (mkAssign (CmmLocal res) $
- CmmMachOp (mop rep wordWidth) [CmmMachOp (mop wordWidth rep) [arg]])
+ = emitAssign (CmmLocal res) $
+ CmmMachOp (mop rep wordWidth) [CmmMachOp (mop wordWidth rep) [arg]]
emitPrimOp r@[res] op args
| Just prim <- callishOp op
@@ -746,15 +743,15 @@ loadArrPtrsSize addr = CmmLoad (cmmOffsetB addr off) bWord
mkBasicIndexedRead :: ByteOff -> Maybe MachOp -> CmmType
-> LocalReg -> CmmExpr -> CmmExpr -> FCode ()
mkBasicIndexedRead off Nothing read_rep res base idx
- = emit (mkAssign (CmmLocal res) (cmmLoadIndexOffExpr off read_rep base idx))
+ = emitAssign (CmmLocal res) (cmmLoadIndexOffExpr off read_rep base idx)
mkBasicIndexedRead off (Just cast) read_rep res base idx
- = emit (mkAssign (CmmLocal res) (CmmMachOp cast [
- cmmLoadIndexOffExpr off read_rep base idx]))
+ = emitAssign (CmmLocal res) (CmmMachOp cast [
+ cmmLoadIndexOffExpr off read_rep base idx])
mkBasicIndexedWrite :: ByteOff -> Maybe MachOp
-> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
mkBasicIndexedWrite off Nothing base idx val
- = emit (mkStore (cmmIndexOffExpr off (typeWidth (cmmExprType val)) base idx) val)
+ = emitStore (cmmIndexOffExpr off (typeWidth (cmmExprType val)) base idx) val
mkBasicIndexedWrite off (Just cast) base idx val
= mkBasicIndexedWrite off Nothing base idx (CmmMachOp cast [val])
@@ -805,7 +802,7 @@ doCopyMutableByteArrayOp = emitCopyByteArray copy
getCode $ emitMemmoveCall dst_p src_p bytes (CmmLit (mkIntCLit 1)),
getCode $ emitMemcpyCall dst_p src_p bytes (CmmLit (mkIntCLit 1))
]
- emit $ mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall
+ emit =<< mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall
emitCopyByteArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ())
@@ -875,7 +872,7 @@ doCopyMutableArrayOp = emitCopyArray copy
getCode $ emitMemmoveCall dst_p src_p bytes (CmmLit (mkIntCLit wORD_SIZE)),
getCode $ emitMemcpyCall dst_p src_p bytes (CmmLit (mkIntCLit wORD_SIZE))
]
- emit $ mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall
+ emit =<< mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall
emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ())
diff --git a/compiler/codeGen/StgCmmProf.hs b/compiler/codeGen/StgCmmProf.hs
index 6a53317385..9ff4d0be07 100644
--- a/compiler/codeGen/StgCmmProf.hs
+++ b/compiler/codeGen/StgCmmProf.hs
@@ -103,7 +103,7 @@ initUpdFrameProf :: CmmExpr -> FCode ()
-- Initialise the profiling field of an update frame
initUpdFrameProf frame_amode
= ifProfiling $ -- frame->header.prof.ccs = CCCS
- emit (mkStore (cmmOffsetB frame_amode oFFSET_StgHeader_ccs) curCCS)
+ emitStore (cmmOffsetB frame_amode oFFSET_StgHeader_ccs) curCCS
-- frame->header.prof.hp.rs = NULL (or frame-header.prof.hp.ldvw = 0)
-- is unnecessary because it is not used anyhow.
@@ -143,7 +143,7 @@ saveCurrentCostCentre
= return Nothing
| otherwise
= do { local_cc <- newTemp ccType
- ; emit (mkAssign (CmmLocal local_cc) curCCS)
+ ; emitAssign (CmmLocal local_cc) curCCS
; return (Just local_cc) }
restoreCurrentCostCentre :: Maybe LocalReg -> FCode ()
@@ -338,9 +338,9 @@ ldvEnter cl_ptr
-- if (era > 0) {
-- LDVW((c)) = (LDVW((c)) & LDV_CREATE_MASK) |
-- era | LDV_STATE_USE }
- emit (mkCmmIfThenElse (CmmMachOp mo_wordUGt [loadEra, CmmLit zeroCLit])
+ emit =<< mkCmmIfThenElse (CmmMachOp mo_wordUGt [loadEra, CmmLit zeroCLit])
(mkStore ldv_wd new_ldv_wd)
- mkNop)
+ mkNop
where
-- don't forget to substract node's tag
ldv_wd = ldvWord cl_ptr
diff --git a/compiler/codeGen/StgCmmTicky.hs b/compiler/codeGen/StgCmmTicky.hs
index d0432315ab..698bf32709 100644
--- a/compiler/codeGen/StgCmmTicky.hs
+++ b/compiler/codeGen/StgCmmTicky.hs
@@ -180,7 +180,7 @@ registerTickyCtr :: CLabel -> FCode ()
-- ticky_entry_ctrs = & (f_ct); /* mark it as "registered" */
-- f_ct.registeredp = 1 }
registerTickyCtr ctr_lbl
- = emit (mkCmmIfThen test (catAGraphs register_stmts))
+ = emit =<< mkCmmIfThen test (catAGraphs register_stmts)
where
-- krc: code generator doesn't handle Not, so we test for Eq 0 instead
test = CmmMachOp (MO_Eq wordWidth)
@@ -352,7 +352,7 @@ bumpHistogram _lbl _n
bumpHistogramE :: LitString -> CmmExpr -> FCode ()
bumpHistogramE lbl n
= do t <- newTemp cLong
- emit (mkAssign (CmmLocal t) n)
+ emitAssign (CmmLocal t) n
emit (mkCmmIfThen (CmmMachOp (MO_U_Le cLongWidth) [CmmReg (CmmLocal t), eight])
(mkAssign (CmmLocal t) eight))
emit (addToMem cLong
diff --git a/compiler/codeGen/StgCmmUtils.hs b/compiler/codeGen/StgCmmUtils.hs
index dda2260a04..273e59b0b5 100644
--- a/compiler/codeGen/StgCmmUtils.hs
+++ b/compiler/codeGen/StgCmmUtils.hs
@@ -18,12 +18,11 @@ module StgCmmUtils (
emitDataLits, mkDataLits,
emitRODataLits, mkRODataLits,
emitRtsCall, emitRtsCallWithVols, emitRtsCallWithResult, emitRtsCallGen,
- assignTemp, newTemp, withTemp,
+ assignTemp, newTemp,
newUnboxedTupleRegs,
- mkMultiAssign, mkCmmSwitch, mkCmmLitSwitch,
- emitSwitch,
+ emitMultiAssign, emitCmmLitSwitch, emitSwitch,
tagToClosure, mkTaggedObjectLoad,
@@ -72,6 +71,7 @@ import Module
import Literal
import Digraph
import ListSetOps
+import VarSet
import Util
import Unique
import DynFlags
@@ -79,6 +79,8 @@ import FastString
import Outputable
import Data.Char
+import Data.List
+import Data.Ord
import Data.Word
import Data.Maybe
@@ -202,14 +204,14 @@ emitRtsCallGen
emitRtsCallGen res pkg fun args _vols safe
= do { updfr_off <- getUpdFrameOff
; emit caller_save
- ; emit $ call updfr_off
+ ; call updfr_off
; emit caller_load }
where
call updfr_off =
if safe then
- mkCmmCall fun_expr res' args' updfr_off
+ emit =<< mkCmmCall fun_expr res' args' updfr_off
else
- mkUnsafeCall (ForeignTarget fun_expr
+ emit $ mkUnsafeCall (ForeignTarget fun_expr
(ForeignConvention CCallConv arg_hints res_hints)) res' args'
(args', arg_hints) = unzip args
(res', res_hints) = unzip res
@@ -439,7 +441,7 @@ assignTemp :: CmmExpr -> FCode LocalReg
assignTemp (CmmReg (CmmLocal reg)) = return reg
assignTemp e = do { uniq <- newUnique
; let reg = LocalReg uniq (cmmExprType e)
- ; emit (mkAssign (CmmLocal reg) e)
+ ; emitAssign (CmmLocal reg) e
; return reg }
newTemp :: CmmType -> FCode LocalReg
@@ -469,10 +471,10 @@ newUnboxedTupleRegs res_ty
-------------------------------------------------------------------------
--- mkMultiAssign
+-- emitMultiAssign
-------------------------------------------------------------------------
-mkMultiAssign :: [LocalReg] -> [CmmExpr] -> CmmAGraph
+emitMultiAssign :: [LocalReg] -> [CmmExpr] -> FCode ()
-- Emit code to perform the assignments in the
-- input simultaneously, using temporary variables when necessary.
@@ -487,14 +489,13 @@ type Stmt = (LocalReg, CmmExpr) -- r := e
-- s1 assigns to something s2 uses
-- that is, if s1 should *follow* s2 in the final order
-mkMultiAssign [] [] = mkNop
-mkMultiAssign [reg] [rhs] = mkAssign (CmmLocal reg) rhs
-mkMultiAssign regs rhss = ASSERT( equalLength regs rhss )
- unscramble ([1..] `zip` (regs `zip` rhss))
+emitMultiAssign [] [] = return ()
+emitMultiAssign [reg] [rhs] = emitAssign (CmmLocal reg) rhs
+emitMultiAssign regs rhss = ASSERT( equalLength regs rhss )
+ unscramble ([1..] `zip` (regs `zip` rhss))
-unscramble :: [Vrtx] -> CmmAGraph
-unscramble vertices
- = catAGraphs (map do_component components)
+unscramble :: [Vrtx] -> FCode ()
+unscramble vertices = mapM_ do_component components
where
edges :: [ (Vrtx, Key, [Key]) ]
edges = [ (vertex, key1, edges_from stmt1)
@@ -509,19 +510,19 @@ unscramble vertices
-- do_components deal with one strongly-connected component
-- Not cyclic, or singleton? Just do it
- do_component :: SCC Vrtx -> CmmAGraph
- do_component (AcyclicSCC (_,stmt)) = mk_graph stmt
+ do_component :: SCC Vrtx -> FCode ()
+ do_component (AcyclicSCC (_,stmt)) = mk_graph stmt
do_component (CyclicSCC []) = panic "do_component"
do_component (CyclicSCC [(_,stmt)]) = mk_graph stmt
-- Cyclic? Then go via temporaries. Pick one to
-- break the loop and try again with the rest.
- do_component (CyclicSCC ((_,first_stmt) : rest))
- = withUnique $ \u ->
+ do_component (CyclicSCC ((_,first_stmt) : rest)) = do
+ u <- newUnique
let (to_tmp, from_tmp) = split u first_stmt
- in mk_graph to_tmp
- <*> unscramble rest
- <*> mk_graph from_tmp
+ mk_graph to_tmp
+ unscramble rest
+ mk_graph from_tmp
split :: Unique -> Stmt -> (Stmt, Stmt)
split uniq (reg, rhs)
@@ -530,8 +531,8 @@ unscramble vertices
rep = cmmExprType rhs
tmp = LocalReg uniq rep
- mk_graph :: Stmt -> CmmAGraph
- mk_graph (reg, rhs) = mkAssign (CmmLocal reg) rhs
+ mk_graph :: Stmt -> FCode ()
+ mk_graph (reg, rhs) = emitAssign (CmmLocal reg) rhs
mustFollow :: Stmt -> Stmt -> Bool
(reg, _) `mustFollow` (_, rhs) = CmmLocal reg `regUsedIn` rhs
@@ -549,7 +550,7 @@ emitSwitch :: CmmExpr -- Tag to switch on
-> FCode ()
emitSwitch tag_expr branches mb_deflt lo_tag hi_tag
= do { dflags <- getDynFlags
- ; emit (mkCmmSwitch (via_C dflags) tag_expr branches mb_deflt lo_tag hi_tag) }
+ ; mkCmmSwitch (via_C dflags) tag_expr branches mb_deflt lo_tag hi_tag }
where
via_C dflags | HscC <- hscTarget dflags = True
| otherwise = False
@@ -561,41 +562,40 @@ mkCmmSwitch :: Bool -- True <=> never generate a conditional tree
-> Maybe CmmAGraph -- Default branch (if any)
-> ConTagZ -> ConTagZ -- Min and Max possible values; behaviour
-- outside this range is undefined
- -> CmmAGraph
+ -> FCode ()
-- First, two rather common cases in which there is no work to do
-mkCmmSwitch _ _ [] (Just code) _ _ = code
-mkCmmSwitch _ _ [(_,code)] Nothing _ _ = code
+mkCmmSwitch _ _ [] (Just code) _ _ = emit code
+mkCmmSwitch _ _ [(_,code)] Nothing _ _ = emit code
-- Right, off we go
-mkCmmSwitch via_C tag_expr branches mb_deflt lo_tag hi_tag
- = withFreshLabel "switch join" $ \ join_lbl ->
- label_default join_lbl mb_deflt $ \ mb_deflt ->
- label_branches join_lbl branches $ \ branches ->
- assignTemp' tag_expr $ \tag_expr' ->
+mkCmmSwitch via_C tag_expr branches mb_deflt lo_tag hi_tag = do
+ join_lbl <- newLabelC
+ mb_deflt_lbl <- label_default join_lbl mb_deflt
+ branches_lbls <- label_branches join_lbl branches
+ tag_expr' <- assignTemp' tag_expr
- mk_switch tag_expr' (sortLe le branches) mb_deflt
- lo_tag hi_tag via_C
- -- Sort the branches before calling mk_switch
- <*> mkLabel join_lbl
+ emit =<< mk_switch tag_expr' (sortBy (comparing fst) branches_lbls)
+ mb_deflt_lbl lo_tag hi_tag via_C
- where
- (t1,_) `le` (t2,_) = t1 <= t2
+ -- Sort the branches before calling mk_switch
+
+ emitLabel join_lbl
mk_switch :: CmmExpr -> [(ConTagZ, BlockId)]
-> Maybe BlockId
-> ConTagZ -> ConTagZ -> Bool
- -> CmmAGraph
+ -> FCode CmmAGraph
-- SINGLETON TAG RANGE: no case analysis to do
mk_switch _tag_expr [(tag, lbl)] _ lo_tag hi_tag _via_C
| lo_tag == hi_tag
= ASSERT( tag == lo_tag )
- mkBranch lbl
+ return (mkBranch lbl)
-- SINGLETON BRANCH, NO DEFAULT: no case analysis to do
mk_switch _tag_expr [(_tag,lbl)] Nothing _ _ _
- = mkBranch lbl
+ = return (mkBranch lbl)
-- The simplifier might have eliminated a case
-- so we may have e.g. case xs of
-- [] -> e
@@ -604,7 +604,7 @@ mk_switch _tag_expr [(_tag,lbl)] Nothing _ _ _
-- SINGLETON BRANCH: one equality check to do
mk_switch tag_expr [(tag,lbl)] (Just deflt) _ _ _
- = mkCbranch cond deflt lbl
+ = return (mkCbranch cond deflt lbl)
where
cond = cmmNeWord tag_expr (CmmLit (mkIntCLit tag))
-- We have lo_tag < hi_tag, but there's only one branch,
@@ -637,30 +637,34 @@ mk_switch tag_expr branches mb_deflt lo_tag hi_tag via_C
arms :: [Maybe BlockId]
arms = [ find_branch i | i <- [real_lo_tag..real_hi_tag]]
in
- mkSwitch (cmmOffset tag_expr (- real_lo_tag)) arms
+ return (mkSwitch (cmmOffset tag_expr (- real_lo_tag)) arms)
-- if we can knock off a bunch of default cases with one if, then do so
| Just deflt <- mb_deflt, (lowest_branch - lo_tag) >= n_branches
- = mkCmmIfThenElse
+ = do stmts <- mk_switch tag_expr branches mb_deflt
+ lowest_branch hi_tag via_C
+ mkCmmIfThenElse
(cmmULtWord tag_expr (CmmLit (mkIntCLit lowest_branch)))
(mkBranch deflt)
- (mk_switch tag_expr branches mb_deflt
- lowest_branch hi_tag via_C)
+ stmts
| Just deflt <- mb_deflt, (hi_tag - highest_branch) >= n_branches
- = mkCmmIfThenElse
+ = do stmts <- mk_switch tag_expr branches mb_deflt
+ lo_tag highest_branch via_C
+ mkCmmIfThenElse
(cmmUGtWord tag_expr (CmmLit (mkIntCLit highest_branch)))
(mkBranch deflt)
- (mk_switch tag_expr branches mb_deflt
- lo_tag highest_branch via_C)
+ stmts
| otherwise -- Use an if-tree
- = mkCmmIfThenElse
+ = do lo_stmts <- mk_switch tag_expr lo_branches mb_deflt
+ lo_tag (mid_tag-1) via_C
+ hi_stmts <- mk_switch tag_expr hi_branches mb_deflt
+ mid_tag hi_tag via_C
+ mkCmmIfThenElse
(cmmUGeWord tag_expr (CmmLit (mkIntCLit mid_tag)))
- (mk_switch tag_expr hi_branches mb_deflt
- mid_tag hi_tag via_C)
- (mk_switch tag_expr lo_branches mb_deflt
- lo_tag (mid_tag-1) via_C)
+ hi_stmts
+ lo_stmts
-- we test (e >= mid_tag) rather than (e < mid_tag), because
-- the former works better when e is a comparison, and there
-- are two tags 0 & 1 (mid_tag == 1). In this case, the code
@@ -715,32 +719,31 @@ mk_switch tag_expr branches mb_deflt lo_tag hi_tag via_C
is_lo (t,_) = t < mid_tag
--------------
-mkCmmLitSwitch :: CmmExpr -- Tag to switch on
+emitCmmLitSwitch :: CmmExpr -- Tag to switch on
-> [(Literal, CmmAGraph)] -- Tagged branches
-> CmmAGraph -- Default branch (always)
- -> CmmAGraph -- Emit the code
+ -> FCode () -- Emit the code
-- Used for general literals, whose size might not be a word,
-- where there is always a default case, and where we don't know
-- the range of values for certain. For simplicity we always generate a tree.
--
-- ToDo: for integers we could do better here, perhaps by generalising
-- mk_switch and using that. --SDM 15/09/2004
-mkCmmLitSwitch _scrut [] deflt = deflt
-mkCmmLitSwitch scrut branches deflt
- = assignTemp' scrut $ \ scrut' ->
- withFreshLabel "switch join" $ \ join_lbl ->
- label_code join_lbl deflt $ \ deflt ->
- label_branches join_lbl branches $ \ branches ->
- mk_lit_switch scrut' deflt (sortLe le branches)
- <*> mkLabel join_lbl
- where
- le (t1,_) (t2,_) = t1 <= t2
+emitCmmLitSwitch _scrut [] deflt = emit deflt
+emitCmmLitSwitch scrut branches deflt = do
+ scrut' <- assignTemp' scrut
+ join_lbl <- newLabelC
+ deflt_lbl <- label_code join_lbl deflt
+ branches_lbls <- label_branches join_lbl branches
+ emit =<< mk_lit_switch scrut' deflt_lbl
+ (sortBy (comparing fst) branches_lbls)
+ emitLabel join_lbl
mk_lit_switch :: CmmExpr -> BlockId
-> [(Literal,BlockId)]
- -> CmmAGraph
+ -> FCode CmmAGraph
mk_lit_switch scrut deflt [(lit,blk)]
- = mkCbranch (CmmMachOp ne [scrut, CmmLit cmm_lit]) deflt blk
+ = return (mkCbranch (CmmMachOp ne [scrut, CmmLit cmm_lit]) deflt blk)
where
cmm_lit = mkSimpleLit lit
cmm_ty = cmmLitType cmm_lit
@@ -748,9 +751,9 @@ mk_lit_switch scrut deflt [(lit,blk)]
ne = if isFloatType cmm_ty then MO_F_Ne rep else MO_Ne rep
mk_lit_switch scrut deflt_blk_id branches
- = mkCmmIfThenElse cond
- (mk_lit_switch scrut deflt_blk_id lo_branches)
- (mk_lit_switch scrut deflt_blk_id hi_branches)
+ = do lo_blk <- mk_lit_switch scrut deflt_blk_id lo_branches
+ hi_blk <- mk_lit_switch scrut deflt_blk_id hi_branches
+ mkCmmIfThenElse cond lo_blk hi_blk
where
n_branches = length branches
(mid_lit,_) = branches !! (n_branches `div` 2)
@@ -764,49 +767,42 @@ mk_lit_switch scrut deflt_blk_id branches
--------------
-label_default :: BlockId -> Maybe CmmAGraph
- -> (Maybe BlockId -> CmmAGraph)
- -> CmmAGraph
-label_default _ Nothing thing_inside
- = thing_inside Nothing
-label_default join_lbl (Just code) thing_inside
- = label_code join_lbl code $ \ lbl ->
- thing_inside (Just lbl)
+label_default :: BlockId -> Maybe CmmAGraph -> FCode (Maybe BlockId)
+label_default _ Nothing
+ = return Nothing
+label_default join_lbl (Just code)
+ = do lbl <- label_code join_lbl code
+ return (Just lbl)
--------------
-label_branches :: BlockId -> [(a,CmmAGraph)]
- -> ([(a,BlockId)] -> CmmAGraph)
- -> CmmAGraph
-label_branches _join_lbl [] thing_inside
- = thing_inside []
-label_branches join_lbl ((tag,code):branches) thing_inside
- = label_code join_lbl code $ \ lbl ->
- label_branches join_lbl branches $ \ branches' ->
- thing_inside ((tag,lbl):branches')
+label_branches :: BlockId -> [(a,CmmAGraph)] -> FCode [(a,BlockId)]
+label_branches _join_lbl []
+ = return []
+label_branches join_lbl ((tag,code):branches)
+ = do lbl <- label_code join_lbl code
+ branches' <- label_branches join_lbl branches
+ return ((tag,lbl):branches')
--------------
-label_code :: BlockId -> CmmAGraph -> (BlockId -> CmmAGraph) -> CmmAGraph
--- (label_code J code fun)
+label_code :: BlockId -> CmmAGraph -> FCode BlockId
+-- label_code J code
-- generates
--- [L: code; goto J] fun L
-label_code join_lbl code thing_inside
- = withFreshLabel "switch" $ \lbl ->
- outOfLine (mkLabel lbl <*> code <*> mkBranch join_lbl)
- <*> thing_inside lbl
-
+-- [L: code; goto J]
+-- and returns L
+label_code join_lbl code = do
+ lbl <- newLabelC
+ emitOutOfLine lbl (code <*> mkBranch join_lbl)
+ return lbl
--------------
-assignTemp' :: CmmExpr -> (CmmExpr -> CmmAGraph) -> CmmAGraph
-assignTemp' e thing_inside
- | isTrivialCmmExpr e = thing_inside e
- | otherwise = withTemp (cmmExprType e) $ \ lreg ->
- let reg = CmmLocal lreg in
- mkAssign reg e <*> thing_inside (CmmReg reg)
-
-withTemp :: CmmType -> (LocalReg -> CmmAGraph) -> CmmAGraph
-withTemp rep thing_inside
- = withUnique $ \uniq -> thing_inside (LocalReg uniq rep)
-
+assignTemp' :: CmmExpr -> FCode CmmExpr
+assignTemp' e
+ | isTrivialCmmExpr e = return e
+ | otherwise = do
+ lreg <- newTemp (cmmExprType e)
+ let reg = CmmLocal lreg
+ emitAssign reg e
+ return (CmmReg reg)
-------------------------------------------------------------------------
--
@@ -814,36 +810,13 @@ withTemp rep thing_inside
--
-------------------------------------------------------------------------
--- There is just one SRT for each top level binding; all the nested
--- bindings use sub-sections of this SRT. The label is passed down to
--- the nested bindings via the monad.
-
-getSRTInfo :: SRT -> FCode C_SRT
-getSRTInfo (SRTEntries {}) = panic "getSRTInfo"
-
-getSRTInfo (SRT off len bmp)
- | len > hALF_WORD_SIZE_IN_BITS || bmp == [fromIntegral srt_escape]
- = do { id <- newUnique
- -- ; top_srt <- getSRTLabel
- ; let srt_desc_lbl = mkLargeSRTLabel id
- -- JD: We're not constructing and emitting SRTs in the back end,
- -- which renders this code wrong (it now names a now-non-existent label).
- -- ; emitRODataLits srt_desc_lbl
- -- ( cmmLabelOffW top_srt off
- -- : mkWordCLit (fromIntegral len)
- -- : map mkWordCLit bmp)
- ; return (C_SRT srt_desc_lbl 0 srt_escape) }
-
- | otherwise
- = do { top_srt <- getSRTLabel
- ; return (C_SRT top_srt off (fromIntegral (head bmp))) }
- -- The fromIntegral converts to StgHalfWord
-
-getSRTInfo NoSRT
- = -- TODO: Should we panic in this case?
- -- Someone obviously thinks there should be an SRT
- return NoC_SRT
-
+-- | Returns 'True' if there is a non-empty SRT, or 'False' otherwise
+-- NB. the SRT attached to an StgBind is still used in the new codegen
+-- to decide whether we need a static link field on a static closure
+-- or not.
+getSRTInfo :: SRT -> FCode Bool
+getSRTInfo (SRTEntries vs) = return (not (isEmptyVarSet vs))
+getSRTInfo _ = return False
srt_escape :: StgHalfWord
srt_escape = -1
diff --git a/compiler/coreSyn/CoreFVs.lhs b/compiler/coreSyn/CoreFVs.lhs
index eb3cd5e948..d2bb6ed57a 100644
--- a/compiler/coreSyn/CoreFVs.lhs
+++ b/compiler/coreSyn/CoreFVs.lhs
@@ -442,7 +442,7 @@ stableUnfoldingVars fv_cand unf
= case unf of
CoreUnfolding { uf_tmpl = rhs, uf_src = src }
| isStableSource src -> Just (exprSomeFreeVars fv_cand rhs)
- DFunUnfolding _ _ args -> Just (exprsSomeFreeVars fv_cand args)
+ DFunUnfolding _ _ args -> Just (exprsSomeFreeVars fv_cand (dfunArgExprs args))
_other -> Nothing
\end{code}
diff --git a/compiler/coreSyn/CoreSubst.lhs b/compiler/coreSyn/CoreSubst.lhs
index 16173fb332..a8de9c2b16 100644
--- a/compiler/coreSyn/CoreSubst.lhs
+++ b/compiler/coreSyn/CoreSubst.lhs
@@ -658,7 +658,7 @@ substUnfoldingSC subst unf -- Short-cut version
substUnfolding subst (DFunUnfolding ar con args)
= DFunUnfolding ar con (map subst_arg args)
where
- subst_arg = substExpr (text "dfun-unf") subst
+ subst_arg = fmap (substExpr (text "dfun-unf") subst)
substUnfolding subst unf@(CoreUnfolding { uf_tmpl = tmpl, uf_src = src })
-- Retain an InlineRule!
@@ -1194,7 +1194,8 @@ exprIsConApp_maybe id_unf expr
, length args == dfun_nargs -- See Note [DFun arity check]
, let (dfun_tvs, _n_theta, _cls, dfun_res_tys) = tcSplitDFunTy (idType fun)
subst = zipOpenTvSubst dfun_tvs (stripTypeArgs (takeList dfun_tvs args))
- mk_arg e = mkApps e args
+ mk_arg (DFunPolyArg e) = mkApps e args
+ mk_arg (DFunLamArg i) = args !! i
= dealWithCoercion co (con, substTys subst dfun_res_tys, map mk_arg ops)
-- Look through unfoldings, but only arity-zero one;
diff --git a/compiler/coreSyn/CoreSyn.lhs b/compiler/coreSyn/CoreSyn.lhs
index e52a6cfe45..a84a29a6c0 100644
--- a/compiler/coreSyn/CoreSyn.lhs
+++ b/compiler/coreSyn/CoreSyn.lhs
@@ -49,6 +49,7 @@ module CoreSyn (
-- * Unfolding data types
Unfolding(..), UnfoldingGuidance(..), UnfoldingSource(..),
+ DFunArg(..), dfunArgExprs,
-- ** Constructing 'Unfolding's
noUnfolding, evaldUnfolding, mkOtherCon,
@@ -635,7 +636,7 @@ data Unfolding
DataCon -- The dictionary data constructor (possibly a newtype datacon)
- [CoreExpr] -- Specification of superclasses and methods, in positional order
+ [DFunArg CoreExpr] -- Specification of superclasses and methods, in positional order
| CoreUnfolding { -- An unfolding for an Id with no pragma,
-- or perhaps a NOINLINE pragma
@@ -673,6 +674,21 @@ data Unfolding
-- uf_guidance: Tells us about the /size/ of the unfolding template
------------------------------------------------
+data DFunArg e -- Given (df a b d1 d2 d3)
+ = DFunPolyArg e -- Arg is (e a b d1 d2 d3)
+ | DFunLamArg Int -- Arg is one of [a,b,d1,d2,d3], zero indexed
+ deriving( Functor )
+
+ -- 'e' is often CoreExpr, which are usually variables, but can
+ -- be trivial expressions instead (e.g. a type application).
+
+dfunArgExprs :: [DFunArg e] -> [e]
+dfunArgExprs [] = []
+dfunArgExprs (DFunPolyArg e : as) = e : dfunArgExprs as
+dfunArgExprs (DFunLamArg {} : as) = dfunArgExprs as
+
+
+------------------------------------------------
data UnfoldingSource
= InlineRhs -- The current rhs of the function
-- Replace uf_tmpl each time around
diff --git a/compiler/coreSyn/CoreTidy.lhs b/compiler/coreSyn/CoreTidy.lhs
index 2045538ace..e29c50cc9d 100644
--- a/compiler/coreSyn/CoreTidy.lhs
+++ b/compiler/coreSyn/CoreTidy.lhs
@@ -205,8 +205,8 @@ tidyIdBndr env@(tidy_env, var_env) id
------------ Unfolding --------------
tidyUnfolding :: TidyEnv -> Unfolding -> Unfolding -> Unfolding
-tidyUnfolding tidy_env (DFunUnfolding ar con ids) _
- = DFunUnfolding ar con (map (tidyExpr tidy_env) ids)
+tidyUnfolding tidy_env (DFunUnfolding ar con args) _
+ = DFunUnfolding ar con (map (fmap (tidyExpr tidy_env)) args)
tidyUnfolding tidy_env
unf@(CoreUnfolding { uf_tmpl = unf_rhs, uf_src = src })
unf_from_rhs
diff --git a/compiler/coreSyn/CoreUnfold.lhs b/compiler/coreSyn/CoreUnfold.lhs
index 8f62ed439e..816d34e87b 100644
--- a/compiler/coreSyn/CoreUnfold.lhs
+++ b/compiler/coreSyn/CoreUnfold.lhs
@@ -96,7 +96,7 @@ mkImplicitUnfolding expr = mkTopUnfolding False (simpleOptExpr expr)
mkSimpleUnfolding :: CoreExpr -> Unfolding
mkSimpleUnfolding = mkUnfolding InlineRhs False False
-mkDFunUnfolding :: Type -> [CoreExpr] -> Unfolding
+mkDFunUnfolding :: Type -> [DFunArg CoreExpr] -> Unfolding
mkDFunUnfolding dfun_ty ops
= DFunUnfolding dfun_nargs data_con ops
where
diff --git a/compiler/coreSyn/CoreUtils.lhs b/compiler/coreSyn/CoreUtils.lhs
index c7dc1a6524..17e2966e15 100644
--- a/compiler/coreSyn/CoreUtils.lhs
+++ b/compiler/coreSyn/CoreUtils.lhs
@@ -928,7 +928,7 @@ expr_ok primop_ok other_expr
app_ok :: (PrimOp -> Bool) -> Id -> [Expr b] -> Bool
app_ok primop_ok fun args
= case idDetails fun of
- DFunId new_type -> not new_type
+ DFunId _ new_type -> not new_type
-- DFuns terminate, unless the dict is implemented
-- with a newtype in which case they may not
diff --git a/compiler/coreSyn/MkCore.lhs b/compiler/coreSyn/MkCore.lhs
index 6123e0a346..410d62db7d 100644
--- a/compiler/coreSyn/MkCore.lhs
+++ b/compiler/coreSyn/MkCore.lhs
@@ -86,6 +86,8 @@ import Pair
import Constants
import Data.Char ( ord )
+import Data.List
+import Data.Ord
import Data.Word
infixl 4 `mkCoreApp`, `mkCoreApps`
@@ -100,20 +102,15 @@ infixl 4 `mkCoreApp`, `mkCoreApps`
\begin{code}
sortQuantVars :: [Var] -> [Var]
-- Sort the variables (KindVars, TypeVars, and Ids)
--- into order: Kind, then Type, then Id
-sortQuantVars = sortLe le
+-- into order: Type, then Kind, then Id
+sortQuantVars = sortBy (comparing withCategory)
where
- v1 `le` v2 = case (is_tv v1, is_tv v2) of
- (True, False) -> True
- (False, True) -> False
- (True, True) ->
- case (is_kv v1, is_kv v2) of
- (True, False) -> True
- (False, True) -> False
- _ -> v1 <= v2 -- Same family
- (False, False) -> v1 <= v2
- is_tv v = isTyVar v
- is_kv v = isKindVar v
+ withCategory v = (category v, v)
+ category :: Var -> Int
+ category v
+ | isTyVar v = 1
+ | isKindVar v = 2
+ | otherwise = 3
-- | Bind a binding group over an expression, using a @let@ or @case@ as
-- appropriate (see "CoreSyn#let_app_invariant")
diff --git a/compiler/coreSyn/PprCore.lhs b/compiler/coreSyn/PprCore.lhs
index 8ac0664b8b..39910c0812 100644
--- a/compiler/coreSyn/PprCore.lhs
+++ b/compiler/coreSyn/PprCore.lhs
@@ -430,6 +430,10 @@ instance Outputable Unfolding where
| otherwise = empty
-- Don't print the RHS or we get a quadratic
-- blowup in the size of the printout!
+
+instance Outputable e => Outputable (DFunArg e) where
+ ppr (DFunPolyArg e) = braces (ppr e)
+ ppr (DFunLamArg i) = char '<' <> int i <> char '>'
\end{code}
-----------------------------------------------------
diff --git a/compiler/deSugar/Coverage.lhs b/compiler/deSugar/Coverage.lhs
index c29f39edaa..2a4486eb69 100644
--- a/compiler/deSugar/Coverage.lhs
+++ b/compiler/deSugar/Coverage.lhs
@@ -41,7 +41,6 @@ import Trace.Hpc.Mix
import Trace.Hpc.Util
import BreakArray
-import Data.HashTable ( hashString )
import Data.Map (Map)
import qualified Data.Map as Map
\end{code}
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index 4371bca95e..3c13bb4704 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -185,16 +185,15 @@ Library
CmmOpt
CmmParse
CmmProcPoint
- CmmSpillReload
CmmRewriteAssignments
- CmmStackLayout
CmmType
CmmUtils
+ CmmLayoutStack
MkGraph
OldCmm
+ OldCmmLint
OldCmmUtils
OldPprCmm
- OptimizationFuel
PprBase
PprC
PprCmm
@@ -440,6 +439,7 @@ Library
Pretty
Serialized
State
+ Stream
StringBuffer
UniqFM
UniqSet
@@ -473,6 +473,8 @@ Library
Vectorise.Env
Vectorise.Exp
Vectorise
+ Hoopl.Dataflow
+ Hoopl
Exposed-Modules:
AsmCodeGen
diff --git a/compiler/ghc.mk b/compiler/ghc.mk
index a3005db41b..1ea6159812 100644
--- a/compiler/ghc.mk
+++ b/compiler/ghc.mk
@@ -310,6 +310,9 @@ ifeq "$(GhcWithInterpreter)" "YES"
compiler_stage2_CONFIGURE_OPTS += --flags=ghci
ifeq "$(BuildSharedLibs)" "YES"
+# There are too many symbols to make a Windows DLL for the ghc package,
+# so we don't build it the dyn way; see trac #5987
+ifneq "$(TargetOS_CPP)" "mingw32"
compiler_stage2_CONFIGURE_OPTS += --enable-shared
# If we are going to use dynamic libraries instead of .o files for ghci,
# we will need to always retain CAFs in the compiler.
@@ -318,6 +321,7 @@ compiler_stage2_CONFIGURE_OPTS += --enable-shared
# code is run.
compiler_stage2_CONFIGURE_OPTS += --flags=dynlibs
endif
+endif
ifeq "$(GhcEnableTablesNextToCode) $(GhcUnregisterised)" "YES NO"
# Should GHCI be building info tables in the TABLES_NEXT_TO_CODE style
@@ -351,7 +355,7 @@ ifeq "$(GhcProfiled)" "YES"
# parts of the compiler of interest, and then add further cost centres
# as necessary. Turn on -auto-all for individual modules like this:
-compiler/main/DriverPipeline_HC_OPTS += -auto-all
+# compiler/main/DriverPipeline_HC_OPTS += -auto-all
compiler/main/GhcMake_HC_OPTS += -auto-all
compiler/main/GHC_HC_OPTS += -auto-all
@@ -423,6 +427,14 @@ compiler_stage1_SplitObjs = NO
compiler_stage2_SplitObjs = NO
compiler_stage3_SplitObjs = NO
+ifeq "$(TargetOS_CPP)" "mingw32"
+# There are too many symbols to make a Windows DLL for the ghc package,
+# so we don't build it the dyn way; see trac #5987
+compiler_stage1_EXCLUDED_WAYS := dyn
+compiler_stage2_EXCLUDED_WAYS := dyn
+compiler_stage3_EXCLUDED_WAYS := dyn
+endif
+
# if stage is set to something other than "1" or "", disable stage 1
ifneq "$(filter-out 1,$(stage))" ""
compiler_stage1_NOT_NEEDED = YES
diff --git a/compiler/ghci/ByteCodeGen.lhs b/compiler/ghci/ByteCodeGen.lhs
index 3e4860cf9e..d722964bcd 100644
--- a/compiler/ghci/ByteCodeGen.lhs
+++ b/compiler/ghci/ByteCodeGen.lhs
@@ -66,6 +66,7 @@ import Module
import Data.Map (Map)
import qualified Data.Map as Map
import qualified FiniteMap as Map
+import Data.Ord
-- -----------------------------------------------------------------------------
-- Generating byte code for a complete module
@@ -875,7 +876,7 @@ doCase d s p (_,scrut) bndr alts is_unboxed_tuple
bitmap_size' :: Int
bitmap_size' = fromIntegral bitmap_size
bitmap = intsToReverseBitmap bitmap_size'{-size-}
- (sortLe (<=) (filter (< bitmap_size') rel_slots))
+ (sort (filter (< bitmap_size') rel_slots))
where
binds = Map.toList p
-- NB: unboxed tuple cases bind the scrut binder to the same offset
@@ -1305,7 +1306,7 @@ mkMultiBranch maybe_ncons raw_ways = do
-- shouldn't happen?
mkTree [val] range_lo range_hi
- | range_lo `eqAlt` range_hi
+ | range_lo == range_hi
= return (snd val)
| null defaults -- Note [CASEFAIL]
= do lbl <- getLabelBc
@@ -1349,9 +1350,7 @@ mkMultiBranch maybe_ncons raw_ways = do
return (instrs `appOL` the_default)
where
(defaults, not_defaults) = partition (isNoDiscr.fst) raw_ways
- notd_ways = sortLe
- (\w1 w2 -> leAlt (fst w1) (fst w2))
- not_defaults
+ notd_ways = sortBy (comparing fst) not_defaults
testLT (DiscrI i) fail_label = TESTLT_I i fail_label
testLT (DiscrW i) fail_label = TESTLT_W i fail_label
@@ -1386,22 +1385,6 @@ mkMultiBranch maybe_ncons raw_ways = do
Just n -> (0, fromIntegral n - 1)
Nothing -> (minBound, maxBound)
- (DiscrI i1) `eqAlt` (DiscrI i2) = i1 == i2
- (DiscrW w1) `eqAlt` (DiscrW w2) = w1 == w2
- (DiscrF f1) `eqAlt` (DiscrF f2) = f1 == f2
- (DiscrD d1) `eqAlt` (DiscrD d2) = d1 == d2
- (DiscrP i1) `eqAlt` (DiscrP i2) = i1 == i2
- NoDiscr `eqAlt` NoDiscr = True
- _ `eqAlt` _ = False
-
- (DiscrI i1) `leAlt` (DiscrI i2) = i1 <= i2
- (DiscrW w1) `leAlt` (DiscrW w2) = w1 <= w2
- (DiscrF f1) `leAlt` (DiscrF f2) = f1 <= f2
- (DiscrD d1) `leAlt` (DiscrD d2) = d1 <= d2
- (DiscrP i1) `leAlt` (DiscrP i2) = i1 <= i2
- NoDiscr `leAlt` NoDiscr = True
- _ `leAlt` _ = False
-
isNoDiscr NoDiscr = True
isNoDiscr _ = False
@@ -1431,6 +1414,7 @@ data Discr
| DiscrD Double
| DiscrP Word16
| NoDiscr
+ deriving (Eq, Ord)
instance Outputable Discr where
ppr (DiscrI i) = int i
diff --git a/compiler/ghci/Linker.lhs b/compiler/ghci/Linker.lhs
index a16832b3b3..06096c3579 100644
--- a/compiler/ghci/Linker.lhs
+++ b/compiler/ghci/Linker.lhs
@@ -287,7 +287,7 @@ reallyInitDynLinker dflags =
-- (a) initialise the C dynamic linker
; initObjLinker
- -- (b) Load packages from the command-line
+ -- (b) Load packages from the command-line (Note [preload packages])
; pls <- linkPackages' dflags (preloadPackages (pkgState dflags)) pls0
-- (c) Link libraries from the command-line
@@ -325,6 +325,32 @@ reallyInitDynLinker dflags =
; return pls
}}
+
+{- Note [preload packages]
+
+Why do we need to preload packages from the command line? This is an
+explanation copied from #2437:
+
+I tried to implement the suggestion from #3560, thinking it would be
+easy, but there are two reasons we link in packages eagerly when they
+are mentioned on the command line:
+
+ * So that you can link in extra object files or libraries that
+ depend on the packages. e.g. ghc -package foo -lbar where bar is a
+ C library that depends on something in foo. So we could link in
+ foo eagerly if and only if there are extra C libs or objects to
+ link in, but....
+
+ * Haskell code can depend on a C function exported by a package, and
+ the normal dependency tracking that TH uses can't know about these
+ dependencies. The test ghcilink004 relies on this, for example.
+
+I conclude that we need two -package flags: one that says "this is a
+package I want to make available", and one that says "this is a
+package I want to link in eagerly". Would that be too complicated for
+users?
+-}
+
classifyLdInput :: DynFlags -> FilePath -> IO (Maybe LibrarySpec)
classifyLdInput dflags f
| isObjectFilename f = return (Just (Object f))
diff --git a/compiler/hsSyn/HsBinds.lhs b/compiler/hsSyn/HsBinds.lhs
index 7de9018dbe..26097df6c4 100644
--- a/compiler/hsSyn/HsBinds.lhs
+++ b/compiler/hsSyn/HsBinds.lhs
@@ -34,13 +34,13 @@ import NameSet
import BasicTypes
import Outputable
import SrcLoc
-import Util
import Var
import Bag
import FastString
import Data.Data hiding ( Fixity )
-import Data.List ( intersect )
+import Data.List
+import Data.Ord
\end{code}
%************************************************************************
@@ -267,7 +267,7 @@ pprLHsBindsForUser binds sigs
decls = [(loc, ppr sig) | L loc sig <- sigs] ++
[(loc, ppr bind) | L loc bind <- bagToList binds]
- sort_by_loc decls = sortLe (\(l1,_) (l2,_) -> l1 <= l2) decls
+ sort_by_loc decls = sortBy (comparing fst) decls
pprDeclList :: [SDoc] -> SDoc -- Braces with a space
-- Print a bunch of declarations
diff --git a/compiler/iface/BinIface.hs b/compiler/iface/BinIface.hs
index f749f97cdb..201e7bb900 100644
--- a/compiler/iface/BinIface.hs
+++ b/compiler/iface/BinIface.hs
@@ -23,6 +23,7 @@ import TyCon (TyCon, tyConName, tupleTyConSort, tupleTyConArity, isTupleTyC
import DataCon (dataConName, dataConWorkId, dataConTyCon)
import PrelInfo (wiredInThings, basicKnownKeyNames)
import Id (idName, isDataConWorkId_maybe)
+import CoreSyn (DFunArg(..))
import TysWiredIn
import IfaceEnv
import HscTypes
@@ -1180,13 +1181,21 @@ instance Binary IfaceBinding where
instance Binary IfaceIdDetails where
put_ bh IfVanillaId = putByte bh 0
put_ bh (IfRecSelId a b) = putByte bh 1 >> put_ bh a >> put_ bh b
- put_ bh IfDFunId = putByte bh 2
+ put_ bh (IfDFunId n) = do { putByte bh 2; put_ bh n }
get bh = do
h <- getByte bh
case h of
0 -> return IfVanillaId
1 -> do { a <- get bh; b <- get bh; return (IfRecSelId a b) }
- _ -> return IfDFunId
+ _ -> do { n <- get bh; return (IfDFunId n) }
+
+instance Binary (DFunArg IfaceExpr) where
+ put_ bh (DFunPolyArg e) = putByte bh 0 >> put_ bh e
+ put_ bh (DFunLamArg i) = putByte bh 1 >> put_ bh i
+ get bh = do { h <- getByte bh
+ ; case h of
+ 0 -> do { a <- get bh; return (DFunPolyArg a) }
+ _ -> do { a <- get bh; return (DFunLamArg a) } }
instance Binary IfaceIdInfo where
put_ bh NoInfo = putByte bh 0
diff --git a/compiler/iface/IfaceSyn.lhs b/compiler/iface/IfaceSyn.lhs
index b53398da7d..bc5fc954eb 100644
--- a/compiler/iface/IfaceSyn.lhs
+++ b/compiler/iface/IfaceSyn.lhs
@@ -35,6 +35,8 @@ module IfaceSyn (
#include "HsVersions.h"
import IfaceType
+import CoreSyn( DFunArg, dfunArgExprs )
+import PprCore() -- Printing DFunArgs
import Demand
import Annotations
import Class
@@ -194,7 +196,7 @@ type IfaceAnnTarget = AnnTarget OccName
data IfaceIdDetails
= IfVanillaId
| IfRecSelId IfaceTyCon Bool
- | IfDFunId
+ | IfDFunId Int -- Number of silent args
data IfaceIdInfo
= NoInfo -- When writing interface file without -O
@@ -237,7 +239,7 @@ data IfaceUnfolding
| IfLclWrapper Arity IfLclName -- because the worker can simplify to a function in
-- another module.
- | IfDFunUnfold [IfaceExpr]
+ | IfDFunUnfold [DFunArg IfaceExpr]
--------------------------------
data IfaceExpr
@@ -701,7 +703,7 @@ instance Outputable IfaceIdDetails where
ppr IfVanillaId = empty
ppr (IfRecSelId tc b) = ptext (sLit "RecSel") <+> ppr tc
<+> if b then ptext (sLit "<naughty>") else empty
- ppr IfDFunId = ptext (sLit "DFunId")
+ ppr (IfDFunId ns) = ptext (sLit "DFunId") <> brackets (int ns)
instance Outputable IfaceIdInfo where
ppr NoInfo = empty
@@ -856,7 +858,7 @@ freeNamesIfUnfold (IfCompulsory e) = freeNamesIfExpr e
freeNamesIfUnfold (IfInlineRule _ _ _ e) = freeNamesIfExpr e
freeNamesIfUnfold (IfExtWrapper _ v) = unitNameSet v
freeNamesIfUnfold (IfLclWrapper {}) = emptyNameSet
-freeNamesIfUnfold (IfDFunUnfold vs) = fnList freeNamesIfExpr vs
+freeNamesIfUnfold (IfDFunUnfold vs) = fnList freeNamesIfExpr (dfunArgExprs vs)
freeNamesIfExpr :: IfaceExpr -> NameSet
freeNamesIfExpr (IfaceExt v) = unitNameSet v
diff --git a/compiler/iface/MkIface.lhs b/compiler/iface/MkIface.lhs
index d51fdd460a..ce07b375b3 100644
--- a/compiler/iface/MkIface.lhs
+++ b/compiler/iface/MkIface.lhs
@@ -107,9 +107,11 @@ import Bag
import Exception
import Control.Monad
+import Data.Function
import Data.List
import Data.Map (Map)
import qualified Data.Map as Map
+import Data.Ord
import Data.IORef
import System.Directory
import System.FilePath
@@ -277,9 +279,9 @@ mkIface_ hsc_env maybe_old_fingerprint
-- Sort these lexicographically, so that
-- the result is stable across compilations
- mi_insts = sortLe le_inst iface_insts,
- mi_fam_insts = sortLe le_fam_inst iface_fam_insts,
- mi_rules = sortLe le_rule iface_rules,
+ mi_insts = sortBy cmp_inst iface_insts,
+ mi_fam_insts = sortBy cmp_fam_inst iface_fam_insts,
+ mi_rules = sortBy cmp_rule iface_rules,
mi_vect_info = iface_vect_info,
@@ -347,14 +349,11 @@ mkIface_ hsc_env maybe_old_fingerprint
; return (errs_and_warns, Just (final_iface, no_change_at_all)) }}
where
- r1 `le_rule` r2 = ifRuleName r1 <= ifRuleName r2
- i1 `le_inst` i2 = ifDFun i1 `le_occ` ifDFun i2
- i1 `le_fam_inst` i2 = ifFamInstTcName i1 `le_occ` ifFamInstTcName i2
-
- le_occ :: Name -> Name -> Bool
- -- Compare lexicographically by OccName, *not* by unique, because
- -- the latter is not stable across compilations
- le_occ n1 n2 = nameOccName n1 <= nameOccName n2
+ cmp_rule = comparing ifRuleName
+ -- Compare these lexicographically by OccName, *not* by unique,
+ -- because the latter is not stable across compilations:
+ cmp_inst = comparing (nameOccName . ifDFun)
+ cmp_fam_inst = comparing (nameOccName . ifFamInstTcName)
dflags = hsc_dflags hsc_env
@@ -1685,7 +1684,7 @@ toIfaceLetBndr id = IfLetBndr (occNameFS (getOccName id))
--------------------------
toIfaceIdDetails :: IdDetails -> IfaceIdDetails
toIfaceIdDetails VanillaId = IfVanillaId
-toIfaceIdDetails (DFunId {}) = IfDFunId
+toIfaceIdDetails (DFunId ns _) = IfDFunId ns
toIfaceIdDetails (RecSelId { sel_naughty = n
, sel_tycon = tc }) = IfRecSelId (toIfaceTyCon tc) n
toIfaceIdDetails other = pprTrace "toIfaceIdDetails" (ppr other)
@@ -1750,7 +1749,7 @@ toIfUnfolding lb (CoreUnfolding { uf_tmpl = rhs, uf_arity = arity
if_rhs = toIfaceExpr rhs
toIfUnfolding lb (DFunUnfolding _ar _con ops)
- = Just (HsUnfold lb (IfDFunUnfold (map toIfaceExpr ops)))
+ = Just (HsUnfold lb (IfDFunUnfold (map (fmap toIfaceExpr) ops)))
-- No need to serialise the data constructor;
-- we can recover it from the type of the dfun
diff --git a/compiler/iface/TcIface.lhs b/compiler/iface/TcIface.lhs
index e63bf7268f..80c2029a70 100644
--- a/compiler/iface/TcIface.lhs
+++ b/compiler/iface/TcIface.lhs
@@ -1160,8 +1160,8 @@ do_one (IfaceRec pairs) thing_inside
\begin{code}
tcIdDetails :: Type -> IfaceIdDetails -> IfL IdDetails
tcIdDetails _ IfVanillaId = return VanillaId
-tcIdDetails ty IfDFunId
- = return (DFunId (isNewTyCon (classTyCon cls)))
+tcIdDetails ty (IfDFunId ns)
+ = return (DFunId ns (isNewTyCon (classTyCon cls)))
where
(_, _, cls, _) = tcSplitDFunTy ty
@@ -1225,12 +1225,14 @@ tcUnfolding name _ _ (IfInlineRule arity unsat_ok boring_ok if_expr)
}
tcUnfolding name dfun_ty _ (IfDFunUnfold ops)
- = do { mb_ops1 <- forkM_maybe doc $ mapM tcIfaceExpr ops
+ = do { mb_ops1 <- forkM_maybe doc $ mapM tc_arg ops
; return (case mb_ops1 of
Nothing -> noUnfolding
Just ops1 -> mkDFunUnfolding dfun_ty ops1) }
where
doc = text "Class ops for dfun" <+> ppr name
+ tc_arg (DFunPolyArg e) = do { e' <- tcIfaceExpr e; return (DFunPolyArg e') }
+ tc_arg (DFunLamArg i) = return (DFunLamArg i)
tcUnfolding name ty info (IfExtWrapper arity wkr)
= tcIfaceWrapper name ty info arity (tcIfaceExtId wkr)
diff --git a/compiler/llvmGen/LlvmCodeGen.hs b/compiler/llvmGen/LlvmCodeGen.hs
index 4465957de4..5c2e420545 100644
--- a/compiler/llvmGen/LlvmCodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen.hs
@@ -27,6 +27,7 @@ import UniqSupply
import Util
import SysTools ( figureLlvmVersion )
+import Control.Monad ( when )
import Data.IORef ( writeIORef )
import Data.Maybe ( fromMaybe )
import System.IO
@@ -51,9 +52,7 @@ llvmCodeGen dflags h us cmms
dumpIfSet_dyn dflags Opt_D_dump_llvm "LLVM Code" pprLlvmHeader
bufh <- newBufHandle h
Prt.bufLeftRender bufh $ withPprStyleDoc dflags (mkCodeStyle CStyle) pprLlvmHeader
- ver <- (fromMaybe defaultLlvmVersion) `fmap` figureLlvmVersion dflags
- -- cache llvm version for later use
- writeIORef (llvmVersion dflags) ver
+ ver <- getLlvmVersion
env' <- {-# SCC "llvm_datas_gen" #-}
cmmDataLlvmGens dflags bufh (setLlvmVer ver env) cdata []
{-# SCC "llvm_procs_gen" #-}
@@ -61,6 +60,22 @@ llvmCodeGen dflags h us cmms
bFlush bufh
return ()
+ where
+ -- | Handle setting up the LLVM version.
+ getLlvmVersion = do
+ ver <- (fromMaybe defaultLlvmVersion) `fmap` figureLlvmVersion dflags
+ -- cache llvm version for later use
+ writeIORef (llvmVersion dflags) ver
+ when (ver < minSupportLlvmVersion) $
+ errorMsg dflags (text "You are using an old version of LLVM that"
+ <> text " isn't supported anymore!"
+ $+$ text "We will try though...")
+ when (ver > maxSupportLlvmVersion) $
+ putMsg dflags (text "You are using a new version of LLVM that"
+ <> text " hasn't been tested yet!"
+ $+$ text "We will try though...")
+ return ver
+
-- -----------------------------------------------------------------------------
-- | Do LLVM code generation on all these Cmms data sections.
diff --git a/compiler/llvmGen/LlvmCodeGen/Base.hs b/compiler/llvmGen/LlvmCodeGen/Base.hs
index 2239dbb006..19ca511f16 100644
--- a/compiler/llvmGen/LlvmCodeGen/Base.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Base.hs
@@ -9,7 +9,8 @@ module LlvmCodeGen.Base (
LlvmCmmDecl, LlvmBasicBlock,
LlvmUnresData, LlvmData, UnresLabel, UnresStatic,
- LlvmVersion, defaultLlvmVersion,
+ LlvmVersion, defaultLlvmVersion, minSupportLlvmVersion,
+ maxSupportLlvmVersion,
LlvmEnv, initLlvmEnv, clearVars, varLookup, varInsert,
funLookup, funInsert, getLlvmVer, setLlvmVer, getLlvmPlatform,
@@ -144,7 +145,13 @@ type LlvmVersion = Int
-- | The LLVM Version we assume if we don't know
defaultLlvmVersion :: LlvmVersion
-defaultLlvmVersion = 28
+defaultLlvmVersion = 30
+
+minSupportLlvmVersion :: LlvmVersion
+minSupportLlvmVersion = 28
+
+maxSupportLlvmVersion :: LlvmVersion
+maxSupportLlvmVersion = 31
-- ----------------------------------------------------------------------------
-- * Environment Handling
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index e9d8ac52a8..79a0c00543 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -202,11 +202,12 @@ genCall env t@(CmmPrim (MO_PopCnt w) _) [CmmHinted dst _] args _ = do
-- Handle memcpy function specifically since llvm's intrinsic version takes
-- some extra parameters.
-genCall env t@(CmmPrim op _) [] args CmmMayReturn
+genCall env t@(CmmPrim op _) [] args' CmmMayReturn
| op == MO_Memcpy ||
op == MO_Memset ||
op == MO_Memmove = do
- let (isVolTy, isVolVal) = if getLlvmVer env >= 28
+ let (args, alignVal) = splitAlignVal args'
+ (isVolTy, isVolVal) = if getLlvmVer env >= 28
then ([i1], [mkIntLit i1 0]) else ([], [])
argTy | op == MO_Memset = [i8Ptr, i8, llvmWord, i32] ++ isVolTy
| otherwise = [i8Ptr, i8Ptr, llvmWord, i32] ++ isVolTy
@@ -217,11 +218,22 @@ genCall env t@(CmmPrim op _) [] args CmmMayReturn
(env2, fptr, stmts2, top2) <- getFunPtr env1 funTy t
(argVars', stmts3) <- castVars $ zip argVars argTy
- let arguments = argVars' ++ isVolVal
+ let arguments = argVars' ++ (alignVal:isVolVal)
call = Expr $ Call StdCall fptr arguments []
stmts = stmts1 `appOL` stmts2 `appOL` stmts3
`appOL` trashStmts `snocOL` call
return (env2, stmts, top1 ++ top2)
+
+ where
+ splitAlignVal xs = (init xs, extractLit $ last xs)
+
+ -- Fix for trac #6158. Since LLVM 3.1, opt fails when given anything other
+ -- than a direct constant (i.e. 'i32 8') as the alignment argument for the
+ -- memcpy & co llvm intrinsic functions. So we handle this directly now.
+ extractLit (CmmHinted (CmmLit (CmmInt i _)) _) = mkIntLit i32 i
+ extractLit _other = trace ("WARNING: Non constant alignment value given" ++
+ " for memcpy! Please report to GHC developers")
+ mkIntLit i32 0
genCall env (CmmPrim _ (Just stmts)) _ _ _
= stmtsToInstrs env stmts (nilOL, [])
diff --git a/compiler/main/CodeOutput.lhs b/compiler/main/CodeOutput.lhs
index 24906671cd..e92eb4f34c 100644
--- a/compiler/main/CodeOutput.lhs
+++ b/compiler/main/CodeOutput.lhs
@@ -15,22 +15,22 @@ import UniqSupply ( mkSplitUniqSupply )
import Finder ( mkStubPaths )
import PprC ( writeCs )
-import CmmLint ( cmmLint )
+import OldCmmLint ( cmmLint )
import Packages
import OldCmm ( RawCmmGroup )
import HscTypes
import DynFlags
import Config
import SysTools
+import Stream (Stream)
+import qualified Stream
import ErrUtils
import Outputable
import Module
-import Maybes ( firstJusts )
import SrcLoc
import Control.Exception
-import Control.Monad
import System.Directory
import System.FilePath
import System.IO
@@ -48,19 +48,26 @@ codeOutput :: DynFlags
-> ModLocation
-> ForeignStubs
-> [PackageId]
- -> [RawCmmGroup] -- Compiled C--
+ -> Stream IO RawCmmGroup () -- Compiled C--
-> IO (Bool{-stub_h_exists-}, Maybe FilePath{-stub_c_exists-})
-codeOutput dflags this_mod location foreign_stubs pkg_deps flat_abstractC
+codeOutput dflags this_mod location foreign_stubs pkg_deps cmm_stream
=
- do { when (dopt Opt_DoCmmLinting dflags) $ do
+ do {
+ -- Lint each CmmGroup as it goes past
+ ; let linted_cmm_stream =
+ if dopt Opt_DoCmmLinting dflags
+ then Stream.mapM do_lint cmm_stream
+ else cmm_stream
+
+ do_lint cmm = do
{ showPass dflags "CmmLint"
- ; let lints = map (cmmLint (targetPlatform dflags)) flat_abstractC
- ; case firstJusts lints of
+ ; case cmmLint (targetPlatform dflags) cmm of
Just err -> do { log_action dflags dflags SevDump noSrcSpan defaultDumpStyle err
; ghcExit dflags 1
}
Nothing -> return ()
+ ; return cmm
}
; showPass dflags "CodeOutput"
@@ -68,9 +75,9 @@ codeOutput dflags this_mod location foreign_stubs pkg_deps flat_abstractC
; stubs_exist <- outputForeignStubs dflags this_mod location foreign_stubs
; case hscTarget dflags of {
HscInterpreted -> return ();
- HscAsm -> outputAsm dflags filenm flat_abstractC;
- HscC -> outputC dflags filenm flat_abstractC pkg_deps;
- HscLlvm -> outputLlvm dflags filenm flat_abstractC;
+ HscAsm -> outputAsm dflags filenm linted_cmm_stream;
+ HscC -> outputC dflags filenm linted_cmm_stream pkg_deps;
+ HscLlvm -> outputLlvm dflags filenm linted_cmm_stream;
HscNothing -> panic "codeOutput: HscNothing"
}
; return stubs_exist
@@ -90,12 +97,16 @@ doOutput filenm io_action = bracket (openFile filenm WriteMode) hClose io_action
\begin{code}
outputC :: DynFlags
-> FilePath
- -> [RawCmmGroup]
+ -> Stream IO RawCmmGroup ()
-> [PackageId]
-> IO ()
-outputC dflags filenm flat_absC packages
+outputC dflags filenm cmm_stream packages
= do
+ -- ToDo: make the C backend consume the C-- incrementally, by
+ -- pushing the cmm_stream inside (c.f. nativeCodeGen)
+ rawcmms <- Stream.collect cmm_stream
+
-- figure out which header files to #include in the generated .hc file:
--
-- * extra_includes from packages
@@ -117,7 +128,7 @@ outputC dflags filenm flat_absC packages
doOutput filenm $ \ h -> do
hPutStr h ("/* GHC_PACKAGES " ++ unwords pkg_names ++ "\n*/\n")
hPutStr h cc_injects
- writeCs dflags h flat_absC
+ writeCs dflags h rawcmms
\end{code}
@@ -128,14 +139,14 @@ outputC dflags filenm flat_absC packages
%************************************************************************
\begin{code}
-outputAsm :: DynFlags -> FilePath -> [RawCmmGroup] -> IO ()
-outputAsm dflags filenm flat_absC
+outputAsm :: DynFlags -> FilePath -> Stream IO RawCmmGroup () -> IO ()
+outputAsm dflags filenm cmm_stream
| cGhcWithNativeCodeGen == "YES"
= do ncg_uniqs <- mkSplitUniqSupply 'n'
{-# SCC "OutputAsm" #-} doOutput filenm $
\f -> {-# SCC "NativeCodeGen" #-}
- nativeCodeGen dflags f ncg_uniqs flat_absC
+ nativeCodeGen dflags f ncg_uniqs cmm_stream
| otherwise
= panic "This compiler was built without a native code generator"
@@ -149,12 +160,17 @@ outputAsm dflags filenm flat_absC
%************************************************************************
\begin{code}
-outputLlvm :: DynFlags -> FilePath -> [RawCmmGroup] -> IO ()
-outputLlvm dflags filenm flat_absC
+outputLlvm :: DynFlags -> FilePath -> Stream IO RawCmmGroup () -> IO ()
+outputLlvm dflags filenm cmm_stream
= do ncg_uniqs <- mkSplitUniqSupply 'n'
+
+ -- ToDo: make the LLVM backend consume the C-- incrementally,
+ -- by pushing the cmm_stream inside (c.f. nativeCodeGen)
+ rawcmms <- Stream.collect cmm_stream
+
{-# SCC "llvm_output" #-} doOutput filenm $
\f -> {-# SCC "llvm_CodeGen" #-}
- llvmCodeGen dflags f ncg_uniqs flat_absC
+ llvmCodeGen dflags f ncg_uniqs rawcmms
\end{code}
@@ -240,4 +256,3 @@ outputForeignStubs_help fname doc_str header footer
= do writeFile fname (header ++ doc_str ++ '\n':footer ++ "\n")
return True
\end{code}
-
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 014b721a1b..60b6e82bb7 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -83,7 +83,13 @@ module DynFlags (
-- ** Parsing DynFlags
parseDynamicFlagsCmdLine,
parseDynamicFilePragma,
+ parseDynamicFlagsFull,
+
+ -- ** Available DynFlags
allFlags,
+ flagsAll,
+ flagsDynamic,
+ flagsPackage,
supportedLanguagesAndExtensions,
@@ -158,9 +164,9 @@ data DynFlag
= Opt_D_dump_cmm
| Opt_D_dump_raw_cmm
| Opt_D_dump_cmmz
- | Opt_D_dump_cmmz_pretty
-- All of the cmmz subflags (there are a lot!) Automatically
-- enabled if you run -ddump-cmmz
+ | Opt_D_dump_cmmz_cfg
| Opt_D_dump_cmmz_cbe
| Opt_D_dump_cmmz_proc
| Opt_D_dump_cmmz_spills
@@ -626,6 +632,8 @@ data DynFlags = DynFlags {
-- | what kind of {-# SCC #-} to add automatically
profAuto :: ProfAuto,
+ interactivePrint :: Maybe String,
+
llvmVersion :: IORef (Int)
}
@@ -983,7 +991,8 @@ defaultDynFlags mySettings =
pprCols = 100,
traceLevel = 1,
profAuto = NoProfAuto,
- llvmVersion = panic "defaultDynFlags: No llvmVersion"
+ llvmVersion = panic "defaultDynFlags: No llvmVersion",
+ interactivePrint = Nothing
}
-- Do not use tracingDynFlags!
@@ -1245,7 +1254,8 @@ setObjectDir, setHiDir, setStubDir, setDumpDir, setOutputDir,
setDylibInstallName,
setObjectSuf, setHiSuf, setHcSuf, parseDynLibLoaderMode,
setPgmP, addOptl, addOptP,
- addCmdlineFramework, addHaddockOpts, addGhciScript
+ addCmdlineFramework, addHaddockOpts, addGhciScript,
+ setInteractivePrint
:: String -> DynFlags -> DynFlags
setOutputFile, setOutputHi, setDumpPrefixForce
:: Maybe String -> DynFlags -> DynFlags
@@ -1319,6 +1329,8 @@ addHaddockOpts f d = d{ haddockOptions = Just f}
addGhciScript f d = d{ ghciScripts = f : ghciScripts d}
+setInteractivePrint f d = d{ interactivePrint = Just f}
+
-- -----------------------------------------------------------------------------
-- Command-line options
@@ -1386,31 +1398,39 @@ getStgToDo dflags
-- -----------------------------------------------------------------------------
-- Parsing the dynamic flags.
+
-- | Parse dynamic flags from a list of command line arguments. Returns the
-- the parsed 'DynFlags', the left-over arguments, and a list of warnings.
-- Throws a 'UsageError' if errors occurred during parsing (such as unknown
-- flags or missing arguments).
-parseDynamicFlagsCmdLine :: Monad m =>
- DynFlags -> [Located String]
- -> m (DynFlags, [Located String], [Located String])
- -- ^ Updated 'DynFlags', left-over arguments, and
- -- list of warnings.
-parseDynamicFlagsCmdLine dflags args = parseDynamicFlags dflags args True
+parseDynamicFlagsCmdLine :: Monad m => DynFlags -> [Located String]
+ -> m (DynFlags, [Located String], [Located String])
+ -- ^ Updated 'DynFlags', left-over arguments, and
+ -- list of warnings.
+parseDynamicFlagsCmdLine = parseDynamicFlagsFull flagsAll True
+
-- | Like 'parseDynamicFlagsCmdLine' but does not allow the package flags
-- (-package, -hide-package, -ignore-package, -hide-all-packages, -package-db).
-- Used to parse flags set in a modules pragma.
-parseDynamicFilePragma :: Monad m =>
- DynFlags -> [Located String]
- -> m (DynFlags, [Located String], [Located String])
- -- ^ Updated 'DynFlags', left-over arguments, and
- -- list of warnings.
-parseDynamicFilePragma dflags args = parseDynamicFlags dflags args False
-
-parseDynamicFlags :: Monad m =>
- DynFlags -> [Located String] -> Bool
+parseDynamicFilePragma :: Monad m => DynFlags -> [Located String]
+ -> m (DynFlags, [Located String], [Located String])
+ -- ^ Updated 'DynFlags', left-over arguments, and
+ -- list of warnings.
+parseDynamicFilePragma = parseDynamicFlagsFull flagsDynamic False
+
+
+-- | Parses the dynamically set flags for GHC. This is the most general form of
+-- the dynamic flag parser that the other methods simply wrap. It allows
+-- saying which flags are valid flags and indicating if we are parsing
+-- arguments from the command line or from a file pragma.
+parseDynamicFlagsFull :: Monad m
+ => [Flag (CmdLineP DynFlags)] -- ^ valid flags to match against
+ -> Bool -- ^ are the arguments from the command line?
+ -> DynFlags -- ^ current dynamic flags
+ -> [Located String] -- ^ arguments to parse
-> m (DynFlags, [Located String], [Located String])
-parseDynamicFlags dflags0 args cmdline = do
+parseDynamicFlagsFull activeFlags cmdline dflags0 args = do
-- XXX Legacy support code
-- We used to accept things like
-- optdep-f -optdepdepend
@@ -1423,12 +1443,8 @@ parseDynamicFlags dflags0 args cmdline = do
f xs = xs
args' = f args
- -- Note: -ignore-package (package_flags) must precede -i* (dynamic_flags)
- flag_spec | cmdline = package_flags ++ dynamic_flags
- | otherwise = dynamic_flags
-
let ((leftover, errs, warns), dflags1)
- = runCmdLine (processArgs flag_spec args') dflags0
+ = runCmdLine (processArgs activeFlags args') dflags0
when (not (null errs)) $ ghcError $ errorsToGhcException errs
-- check for disabled flags in safe haskell
@@ -1436,8 +1452,12 @@ parseDynamicFlags dflags0 args cmdline = do
return (dflags2, leftover, sh_warns ++ warns)
+
-- | Check (and potentially disable) any extensions that aren't allowed
-- in safe mode.
+--
+-- The bool is to indicate if we are parsing command line flags (false means
+-- file pragma). This allows us to generate better warnings.
safeFlagCheck :: Bool -> DynFlags -> (DynFlags, [Located String])
safeFlagCheck _ dflags | not (safeLanguageOn dflags || safeInferOn dflags)
= (dflags, [])
@@ -1483,6 +1503,8 @@ safeFlagCheck cmdl dflags =
%* *
%********************************************************************* -}
+-- | All dynamic flags option strings. These are the user facing strings for
+-- enabling and disabling options.
allFlags :: [String]
allFlags = map ('-':) $
[ flagName flag | flag <- dynamic_flags ++ package_flags, ok (flagOptKind flag) ] ++
@@ -1496,6 +1518,23 @@ allFlags = map ('-':) $
fflags1 = [ name | (name, _, _) <- fWarningFlags ]
fflags2 = [ name | (name, _, _) <- fLangFlags ]
+{-
+ - Below we export user facing symbols for GHC dynamic flags for use with the
+ - GHC API.
+ -}
+
+-- All dynamic flags present in GHC.
+flagsAll :: [Flag (CmdLineP DynFlags)]
+flagsAll = package_flags ++ dynamic_flags
+
+-- All dynamic flags, minus package flags, present in GHC.
+flagsDynamic :: [Flag (CmdLineP DynFlags)]
+flagsDynamic = dynamic_flags
+
+-- ALl package flags present in GHC.
+flagsPackage :: [Flag (CmdLineP DynFlags)]
+flagsPackage = package_flags
+
--------------- The main flags themselves ------------------
dynamic_flags :: [Flag (CmdLineP DynFlags)]
dynamic_flags = [
@@ -1610,7 +1649,7 @@ dynamic_flags = [
, Flag "haddock-opts" (hasArg addHaddockOpts)
, Flag "hpcdir" (SepArg setOptHpcDir)
, Flag "ghci-script" (hasArg addGhciScript)
-
+ , Flag "interactive-print" (hasArg setInteractivePrint)
------- recompilation checker --------------------------------------
, Flag "recomp" (NoArg (do unSetDynFlag Opt_ForceRecomp
deprecate "Use -fno-force-recomp instead"))
@@ -1636,7 +1675,7 @@ dynamic_flags = [
, Flag "ddump-cmm" (setDumpFlag Opt_D_dump_cmm)
, Flag "ddump-raw-cmm" (setDumpFlag Opt_D_dump_raw_cmm)
, Flag "ddump-cmmz" (setDumpFlag Opt_D_dump_cmmz)
- , Flag "ddump-cmmz-pretty" (setDumpFlag Opt_D_dump_cmmz_pretty)
+ , Flag "ddump-cmmz-cfg" (setDumpFlag Opt_D_dump_cmmz_cbe)
, Flag "ddump-cmmz-cbe" (setDumpFlag Opt_D_dump_cmmz_cbe)
, Flag "ddump-cmmz-spills" (setDumpFlag Opt_D_dump_cmmz_spills)
, Flag "ddump-cmmz-proc" (setDumpFlag Opt_D_dump_cmmz_proc)
diff --git a/compiler/main/ErrUtils.lhs b/compiler/main/ErrUtils.lhs
index 5f5769d1c9..daa66f9d2f 100644
--- a/compiler/main/ErrUtils.lhs
+++ b/compiler/main/ErrUtils.lhs
@@ -37,7 +37,6 @@ module ErrUtils (
import Bag ( Bag, bagToList, isEmptyBag, emptyBag )
import Exception
-import Util
import Outputable
import Panic
import FastString
@@ -51,6 +50,7 @@ import System.FilePath
import Data.List
import qualified Data.Set as Set
import Data.IORef
+import Data.Ord
import Control.Monad
import System.IO
@@ -178,13 +178,8 @@ printMsgBag dflags bag
errMsgContext = unqual } <- sortMsgBag bag ]
sortMsgBag :: Bag ErrMsg -> [ErrMsg]
-sortMsgBag bag = sortLe srcOrder $ bagToList bag
- where
- srcOrder err1 err2 =
- case compare (head (errMsgSpans err1)) (head (errMsgSpans err2)) of
- LT -> True
- EQ -> True
- GT -> False
+sortMsgBag bag = sortBy (comparing (head . errMsgSpans)) $ bagToList bag
+ -- TODO: Why "head ."? Why not compare the whole list?
ghcExit :: DynFlags -> Int -> IO ()
ghcExit dflags val
diff --git a/compiler/main/HscMain.hs b/compiler/main/HscMain.hs
index 562332d52a..0b03e83029 100644
--- a/compiler/main/HscMain.hs
+++ b/compiler/main/HscMain.hs
@@ -119,13 +119,12 @@ import TyCon
import Name
import SimplStg ( stg2stg )
import CodeGen ( codeGen )
-import OldCmm as Old ( CmmGroup )
-import PprCmm ( pprCmms )
+import qualified OldCmm as Old
+import qualified Cmm as New
import CmmParse ( parseCmmFile )
import CmmBuildInfoTables
import CmmPipeline
import CmmInfo
-import OptimizationFuel ( initOptFuelState )
import CmmCvt
import CodeOutput
import NameEnv ( emptyNameEnv )
@@ -147,6 +146,9 @@ import UniqFM ( emptyUFM )
import UniqSupply ( initUs_ )
import Bag
import Exception
+import qualified Stream
+import Stream (Stream)
+
import Util
import Data.List
@@ -172,7 +174,6 @@ newHscEnv dflags = do
nc_var <- newIORef (initNameCache us knownKeyNames)
fc_var <- newIORef emptyUFM
mlc_var <- newIORef emptyModuleEnv
- optFuel <- initOptFuelState
return HscEnv { hsc_dflags = dflags,
hsc_targets = [],
hsc_mod_graph = [],
@@ -182,7 +183,6 @@ newHscEnv dflags = do
hsc_NC = nc_var,
hsc_FC = fc_var,
hsc_MLC = mlc_var,
- hsc_OptFuel = optFuel,
hsc_type_env_var = Nothing }
@@ -1276,20 +1276,27 @@ hscGenHardCode cgguts mod_summary = do
cost_centre_info
stg_binds hpc_info
else {-# SCC "CodeGen" #-}
- codeGen dflags this_mod data_tycons
- cost_centre_info
- stg_binds hpc_info
+ return (codeGen dflags this_mod data_tycons
+ cost_centre_info
+ stg_binds hpc_info)
+
------------------ Code output -----------------------
- rawcmms <- {-# SCC "cmmToRawCmm" #-}
+ rawcmms0 <- {-# SCC "cmmToRawCmm" #-}
cmmToRawCmm platform cmms
- dumpIfSet_dyn dflags Opt_D_dump_raw_cmm "Raw Cmm" (ppr rawcmms)
+
+ let dump a = do dumpIfSet_dyn dflags Opt_D_dump_raw_cmm "Raw Cmm"
+ (ppr a)
+ return a
+ rawcmms1 = Stream.mapM dump rawcmms0
+
(_stub_h_exists, stub_c_exists)
<- {-# SCC "codeOutput" #-}
codeOutput dflags this_mod location foreign_stubs
- dependencies rawcmms
+ dependencies rawcmms1
return stub_c_exists
+
hscInteractive :: (ModIface, ModDetails, CgGuts)
-> ModSummary
-> Hsc (InteractiveStatus, ModIface, ModDetails)
@@ -1335,7 +1342,7 @@ hscCompileCmmFile hsc_env filename = runHsc hsc_env $ do
let dflags = hsc_dflags hsc_env
cmm <- ioMsgMaybe $ parseCmmFile dflags filename
liftIO $ do
- rawCmms <- cmmToRawCmm (targetPlatform dflags) [cmm]
+ rawCmms <- cmmToRawCmm (targetPlatform dflags) (Stream.yield cmm)
_ <- codeOutput dflags no_mod no_loc NoStubs [] rawCmms
return ()
where
@@ -1350,24 +1357,52 @@ tryNewCodeGen :: HscEnv -> Module -> [TyCon]
-> CollectedCCs
-> [(StgBinding,[(Id,[Id])])]
-> HpcInfo
- -> IO [Old.CmmGroup]
+ -> IO (Stream IO Old.CmmGroup ())
+ -- Note we produce a 'Stream' of CmmGroups, so that the
+ -- backend can be run incrementally. Otherwise it generates all
+ -- the C-- up front, which has a significant space cost.
tryNewCodeGen hsc_env this_mod data_tycons
cost_centre_info stg_binds hpc_info = do
let dflags = hsc_dflags hsc_env
- prog <- StgCmm.codeGen dflags this_mod data_tycons
+
+ let cmm_stream :: Stream IO New.CmmGroup ()
+ cmm_stream = {-# SCC "StgCmm" #-}
+ StgCmm.codeGen dflags this_mod data_tycons
cost_centre_info stg_binds hpc_info
- dumpIfSet_dyn dflags Opt_D_dump_cmmz "Cmm produced by new codegen"
- (pprCmms prog)
+
+ -- codegen consumes a stream of CmmGroup, and produces a new
+ -- stream of CmmGroup (not necessarily synchronised: one
+ -- CmmGroup on input may produce many CmmGroups on output due
+ -- to proc-point splitting).
+
+ let dump1 a = do dumpIfSet_dyn dflags Opt_D_dump_cmmz
+ "Cmm produced by new codegen" (ppr a)
+ return a
+
+ ppr_stream1 = Stream.mapM dump1 cmm_stream
-- We are building a single SRT for the entire module, so
-- we must thread it through all the procedures as we cps-convert them.
us <- mkSplitUniqSupply 'S'
let initTopSRT = initUs_ us emptySRT
- (topSRT, prog) <- foldM (cmmPipeline hsc_env) (initTopSRT, []) prog
- let prog' = map cmmOfZgraph (srtToData topSRT : prog)
- dumpIfSet_dyn dflags Opt_D_dump_cmmz "Output Cmm" (ppr prog')
- return prog'
+ let run_pipeline topSRT cmmgroup = do
+ (topSRT, cmmgroup) <- cmmPipeline hsc_env topSRT cmmgroup
+ return (topSRT,cmmOfZgraph cmmgroup)
+
+ let pipeline_stream = {-# SCC "cmmPipeline" #-} do
+ topSRT <- Stream.mapAccumL run_pipeline initTopSRT ppr_stream1
+ Stream.yield (cmmOfZgraph (srtToData topSRT))
+
+ let
+ dump2 a = do dumpIfSet_dyn dflags Opt_D_dump_cmmz "Output Cmm" $ ppr a
+ return a
+
+ ppr_stream2 = Stream.mapM dump2 pipeline_stream
+
+ return ppr_stream2
+
+
myCoreToStg :: DynFlags -> Module -> CoreProgram
-> IO ( [(StgBinding,[(Id,[Id])])] -- output program
diff --git a/compiler/main/HscTypes.lhs b/compiler/main/HscTypes.lhs
index 1631e8ccaf..adaa9a3171 100644
--- a/compiler/main/HscTypes.lhs
+++ b/compiler/main/HscTypes.lhs
@@ -44,6 +44,7 @@ module HscTypes (
InteractiveContext(..), emptyInteractiveContext,
icPrintUnqual, icInScopeTTs, icPlusGblRdrEnv,
extendInteractiveContext, substInteractiveContext,
+ setInteractivePrintName,
InteractiveImport(..),
mkPrintUnqualified, pprModulePrefix,
@@ -136,12 +137,11 @@ import Annotations
import Class
import TyCon
import DataCon
-import PrelNames ( gHC_PRIM, ioTyConName )
+import PrelNames ( gHC_PRIM, ioTyConName, printName )
import Packages hiding ( Version(..) )
import DynFlags
import DriverPhases
import BasicTypes
-import OptimizationFuel ( OptFuelState )
import IfaceSyn
import CoreSyn ( CoreRule, CoreVect )
import Maybes
@@ -317,11 +317,6 @@ data HscEnv
-- ^ This caches the location of modules, so we don't have to
-- search the filesystem multiple times. See also 'hsc_FC'.
- hsc_OptFuel :: OptFuelState,
- -- ^ Settings to control the use of \"optimization fuel\":
- -- by limiting the number of transformations,
- -- we can use binary search to help find compiler bugs.
-
hsc_type_env_var :: Maybe (Module, IORef TypeEnv)
-- ^ Used for one-shot compilation only, to initialise
-- the 'IfGblEnv'. See 'TcRnTypes.tcg_type_env_var' for
@@ -943,6 +938,10 @@ data InteractiveContext
ic_fix_env :: FixityEnv,
-- ^ Fixities declared in let statements
+
+ ic_int_print :: Name,
+ -- ^ The function that is used for printing results
+ -- of expressions in ghci and -e mode.
#ifdef GHCI
ic_resume :: [Resume],
@@ -986,6 +985,8 @@ emptyInteractiveContext dflags
ic_sys_vars = [],
ic_instances = ([],[]),
ic_fix_env = emptyNameEnv,
+ -- System.IO.print by default
+ ic_int_print = printName,
#ifdef GHCI
ic_resume = [],
#endif
@@ -1020,6 +1021,9 @@ extendInteractiveContext ictxt new_tythings
new_names = [ nameOccName (getName id) | AnId id <- new_tythings ]
+setInteractivePrintName :: InteractiveContext -> Name -> InteractiveContext
+setInteractivePrintName ic n = ic{ic_int_print = n}
+
-- ToDo: should not add Ids to the gbl env here
-- | Add TyThings to the GlobalRdrEnv, earlier ones in the list shadowing
@@ -1090,7 +1094,7 @@ exposed (say P2), so we use M.T for that, and P1:M.T for the other one.
This is handled by the qual_mod component of PrintUnqualified, inside
the (ppr mod) of case (3), in Name.pprModulePrefix
-\begin{code}
+ \begin{code}
-- | Creates some functions that work out the best ways to format
-- names for the user according to a set of heuristics
mkPrintUnqualified :: DynFlags -> GlobalRdrEnv -> PrintUnqualified
diff --git a/compiler/main/Packages.lhs b/compiler/main/Packages.lhs
index 9831367fff..5bea131088 100644
--- a/compiler/main/Packages.lhs
+++ b/compiler/main/Packages.lhs
@@ -767,7 +767,9 @@ mkPackageState dflags pkgs0 preload0 this_package = do
--
let preload1 = [ installedPackageId p | f <- flags, p <- get_exposed f ]
- get_exposed (ExposePackage s) = filter (matchingStr s) pkgs2
+ get_exposed (ExposePackage s)
+ = take 1 $ sortByVersion (filter (matchingStr s) pkgs2)
+ -- -package P means "the latest version of P" (#7030)
get_exposed (ExposePackageId s) = filter (matchingId s) pkgs2
get_exposed _ = []
diff --git a/compiler/main/StaticFlagParser.hs b/compiler/main/StaticFlagParser.hs
index 88e92a7c03..b927f12d2c 100644
--- a/compiler/main/StaticFlagParser.hs
+++ b/compiler/main/StaticFlagParser.hs
@@ -9,7 +9,11 @@
--
-----------------------------------------------------------------------------
-module StaticFlagParser (parseStaticFlags) where
+module StaticFlagParser (
+ parseStaticFlags,
+ parseStaticFlagsFull,
+ flagsStatic
+ ) where
#include "HsVersions.h"
@@ -46,11 +50,18 @@ import Data.List
-- XXX: can we add an auto-generated list of static flags here?
--
parseStaticFlags :: [Located String] -> IO ([Located String], [Located String])
-parseStaticFlags args = do
+parseStaticFlags = parseStaticFlagsFull flagsStatic
+
+-- | Parse GHC's static flags as @parseStaticFlags@ does. However it also
+-- takes a list of available static flags, such that certain flags can be
+-- enabled or disabled through this argument.
+parseStaticFlagsFull :: [Flag IO] -> [Located String]
+ -> IO ([Located String], [Located String])
+parseStaticFlagsFull flagsAvailable args = do
ready <- readIORef v_opt_C_ready
when ready $ ghcError (ProgramError "Too late for parseStaticFlags: call it before newSession")
- (leftover, errs, warns1) <- processArgs static_flags args
+ (leftover, errs, warns1) <- processArgs flagsAvailable args
when (not (null errs)) $ ghcError $ errorsToGhcException errs
-- deal with the way flags: the way (eg. prof) gives rise to
@@ -62,8 +73,10 @@ parseStaticFlags args = do
let unreg_flags | cGhcUnregisterised == "YES" = unregFlags
| otherwise = []
+ -- as these are GHC generated flags, we parse them with all static flags
+ -- in scope, regardless of what availableFlags are passed in.
(more_leftover, errs, warns2) <-
- processArgs static_flags (unreg_flags ++ way_flags')
+ processArgs flagsStatic (unreg_flags ++ way_flags')
-- see sanity code in staticOpts
writeIORef v_opt_C_ready True
@@ -88,7 +101,7 @@ parseStaticFlags args = do
return (excess_prec ++ cg_flags ++ more_leftover ++ leftover,
warns1 ++ warns2)
-static_flags :: [Flag IO]
+flagsStatic :: [Flag IO]
-- All the static flags should appear in this list. It describes how each
-- static flag should be processed. Two main purposes:
-- (a) if a command-line flag doesn't appear in the list, GHC can complain
@@ -102,7 +115,7 @@ static_flags :: [Flag IO]
-- is a prefix flag (i.e. HasArg, Prefix, OptPrefix, AnySuffix) will override
-- flags further down the list with the same prefix.
-static_flags = [
+flagsStatic = [
------- ways --------------------------------------------------------
Flag "prof" (NoArg (addWay WayProf))
, Flag "eventlog" (NoArg (addWay WayEventLog))
diff --git a/compiler/main/StaticFlags.hs b/compiler/main/StaticFlags.hs
index 3a4c2da9e4..4695d83ed0 100644
--- a/compiler/main/StaticFlags.hs
+++ b/compiler/main/StaticFlags.hs
@@ -98,6 +98,7 @@ import Maybes ( firstJusts )
import Panic
import Control.Monad ( liftM3 )
+import Data.Function
import Data.Maybe ( listToMaybe )
import Data.IORef
import System.IO.Unsafe ( unsafePerformIO )
diff --git a/compiler/main/SysTools.lhs b/compiler/main/SysTools.lhs
index e44338918a..49314f2823 100644
--- a/compiler/main/SysTools.lhs
+++ b/compiler/main/SysTools.lhs
@@ -499,8 +499,8 @@ runClang dflags args = do
runSomething dflags "Clang (Assembler)" clang args
)
(\(err :: SomeException) -> do
- putMsg dflags $ text $ "Error running clang! you need clang installed"
- ++ " to use the LLVM backend"
+ errorMsg dflags $ text $ "Error running clang! you need clang installed"
+ ++ " to use the LLVM backend"
throw err
)
@@ -538,7 +538,7 @@ figureLlvmVersion dflags = do
debugTraceMsg dflags 2
(text "Error (figuring out LLVM version):" <+>
text (show err))
- putMsg dflags $ vcat
+ errorMsg dflags $ vcat
[ text "Warning:", nest 9 $
text "Couldn't figure out LLVM version!" $$
text "Make sure you have installed LLVM"]
diff --git a/compiler/main/TidyPgm.lhs b/compiler/main/TidyPgm.lhs
index 5cd3f76250..8e4e7dd0a0 100644
--- a/compiler/main/TidyPgm.lhs
+++ b/compiler/main/TidyPgm.lhs
@@ -56,6 +56,7 @@ import Util
import FastString
import Control.Monad
+import Data.Function
import Data.List ( sortBy )
import Data.IORef ( readIORef, writeIORef )
\end{code}
@@ -881,7 +882,7 @@ dffvLetBndr vanilla_unfold id
-- but I've seen cases where we had a wrapper id $w but a
-- rhs where $w had been inlined; see Trac #3922
- go_unf (DFunUnfolding _ _ args) = mapM_ dffvExpr args
+ go_unf (DFunUnfolding _ _ args) = mapM_ dffvExpr (dfunArgExprs args)
go_unf _ = return ()
go_rule (BuiltinRule {}) = return ()
diff --git a/compiler/nativeGen/AsmCodeGen.lhs b/compiler/nativeGen/AsmCodeGen.lhs
index 51adf46005..4b49fe304e 100644
--- a/compiler/nativeGen/AsmCodeGen.lhs
+++ b/compiler/nativeGen/AsmCodeGen.lhs
@@ -71,6 +71,8 @@ import FastString
import UniqSet
import ErrUtils
import Module
+import Stream (Stream)
+import qualified Stream
-- DEBUGGING ONLY
--import OrdList
@@ -147,7 +149,7 @@ data NcgImpl statics instr jumpDest = NcgImpl {
}
--------------------
-nativeCodeGen :: DynFlags -> Handle -> UniqSupply -> [RawCmmGroup] -> IO ()
+nativeCodeGen :: DynFlags -> Handle -> UniqSupply -> Stream IO RawCmmGroup () -> IO ()
nativeCodeGen dflags h us cmms
= let platform = targetPlatform dflags
nCG' :: (Outputable statics, Outputable instr, Instruction instr) => NcgImpl statics instr jumpDest -> IO ()
@@ -209,16 +211,16 @@ nativeCodeGen dflags h us cmms
nativeCodeGen' :: (Outputable statics, Outputable instr, Instruction instr)
=> DynFlags
-> NcgImpl statics instr jumpDest
- -> Handle -> UniqSupply -> [RawCmmGroup] -> IO ()
+ -> Handle -> UniqSupply -> Stream IO RawCmmGroup () -> IO ()
nativeCodeGen' dflags ncgImpl h us cmms
= do
let platform = targetPlatform dflags
- split_cmms = concat $ map add_split cmms
+ split_cmms = Stream.map add_split cmms
-- BufHandle is a performance hack. We could hide it inside
-- Pretty if it weren't for the fact that we do lots of little
-- printDocs here (in order to do codegen in constant space).
bufh <- newBufHandle h
- (imports, prof) <- cmmNativeGens dflags ncgImpl bufh us split_cmms [] [] 0
+ (imports, prof) <- cmmNativeGenStream dflags ncgImpl bufh us split_cmms [] [] 0
bFlush bufh
let (native, colorStats, linearStats)
@@ -272,6 +274,34 @@ nativeCodeGen' dflags ncgImpl h us cmms
split_marker = CmmProc Nothing mkSplitMarkerLabel (ListGraph [])
+cmmNativeGenStream :: (Outputable statics, Outputable instr, Instruction instr)
+ => DynFlags
+ -> NcgImpl statics instr jumpDest
+ -> BufHandle
+ -> UniqSupply
+ -> Stream IO RawCmmGroup ()
+ -> [[CLabel]]
+ -> [ ([NatCmmDecl statics instr],
+ Maybe [Color.RegAllocStats statics instr],
+ Maybe [Linear.RegAllocStats]) ]
+ -> Int
+ -> IO ( [[CLabel]],
+ [([NatCmmDecl statics instr],
+ Maybe [Color.RegAllocStats statics instr],
+ Maybe [Linear.RegAllocStats])] )
+
+cmmNativeGenStream dflags ncgImpl h us cmm_stream impAcc profAcc count
+ = do
+ r <- Stream.runStream cmm_stream
+ case r of
+ Left () -> return (reverse impAcc, reverse profAcc)
+ Right (cmms, cmm_stream') -> do
+ (impAcc,profAcc,us') <- cmmNativeGens dflags ncgImpl h us cmms
+ impAcc profAcc count
+ cmmNativeGenStream dflags ncgImpl h us' cmm_stream'
+ impAcc profAcc count
+
+
-- | Do native code generation on all these cmms.
--
cmmNativeGens :: (Outputable statics, Outputable instr, Instruction instr)
@@ -287,11 +317,12 @@ cmmNativeGens :: (Outputable statics, Outputable instr, Instruction instr)
-> Int
-> IO ( [[CLabel]],
[([NatCmmDecl statics instr],
- Maybe [Color.RegAllocStats statics instr],
- Maybe [Linear.RegAllocStats])] )
+ Maybe [Color.RegAllocStats statics instr],
+ Maybe [Linear.RegAllocStats])],
+ UniqSupply )
-cmmNativeGens _ _ _ _ [] impAcc profAcc _
- = return (reverse impAcc, reverse profAcc)
+cmmNativeGens _ _ _ us [] impAcc profAcc _
+ = return (impAcc,profAcc,us)
cmmNativeGens dflags ncgImpl h us (cmm : cmms) impAcc profAcc count
= do
@@ -817,7 +848,11 @@ Ideas for other things we could do (put these in Hoopl please!):
cmmToCmm :: DynFlags -> RawCmmDecl -> (RawCmmDecl, [CLabel])
cmmToCmm _ top@(CmmData _ _) = (top, [])
cmmToCmm dflags (CmmProc info lbl (ListGraph blocks)) = runCmmOpt dflags $ do
- blocks' <- mapM cmmBlockConFold (cmmMiniInline dflags (cmmEliminateDeadBlocks blocks))
+ let reachable_blocks | dopt Opt_TryNewCodeGen dflags = blocks
+ | otherwise = cmmEliminateDeadBlocks blocks
+ -- The new codegen path has already eliminated unreachable blocks by now
+
+ blocks' <- mapM cmmBlockConFold (cmmMiniInline dflags reachable_blocks)
return $ CmmProc info lbl (ListGraph blocks')
newtype CmmOptM a = CmmOptM (([CLabel], DynFlags) -> (# a, [CLabel] #))
@@ -911,7 +946,8 @@ cmmExprConFold referenceKind expr = do
dflags <- getDynFlags
-- Skip constant folding if new code generator is running
-- (this optimization is done in Hoopl)
- let expr' = if dopt Opt_TryNewCodeGen dflags
+ -- SDM: re-enabled for now, while cmmRewriteAssignments is turned off
+ let expr' = if False -- dopt Opt_TryNewCodeGen dflags
then expr
else cmmExprCon (targetPlatform dflags) expr
cmmExprNative referenceKind expr'
diff --git a/compiler/nativeGen/RegAlloc/Linear/Main.hs b/compiler/nativeGen/RegAlloc/Linear/Main.hs
index 8c38fd1de6..44fc8ef896 100644
--- a/compiler/nativeGen/RegAlloc/Linear/Main.hs
+++ b/compiler/nativeGen/RegAlloc/Linear/Main.hs
@@ -293,7 +293,7 @@ processBlock
-> RegM freeRegs [NatBasicBlock instr] -- ^ block with registers allocated
processBlock platform block_live (BasicBlock id instrs)
- = do initBlock id
+ = do initBlock id block_live
(instrs', fixups)
<- linearRA platform block_live [] [] id instrs
return $ BasicBlock id instrs' : fixups
@@ -301,16 +301,22 @@ processBlock platform block_live (BasicBlock id instrs)
-- | Load the freeregs and current reg assignment into the RegM state
-- for the basic block with this BlockId.
-initBlock :: FR freeRegs => BlockId -> RegM freeRegs ()
-initBlock id
+initBlock :: FR freeRegs => BlockId -> BlockMap RegSet -> RegM freeRegs ()
+initBlock id block_live
= do block_assig <- getBlockAssigR
case mapLookup id block_assig of
- -- no prior info about this block: assume everything is
- -- free and the assignment is empty.
+ -- no prior info about this block: we must consider
+ -- any fixed regs to be allocated, but we can ignore
+ -- virtual regs (presumably this is part of a loop,
+ -- and we'll iterate again). The assignment begins
+ -- empty.
Nothing
-> do -- pprTrace "initFreeRegs" (text $ show initFreeRegs) (return ())
-
- setFreeRegsR frInitFreeRegs
+ case mapLookup id block_live of
+ Nothing ->
+ setFreeRegsR frInitFreeRegs
+ Just live ->
+ setFreeRegsR $ foldr frAllocateReg frInitFreeRegs [ r | RegReal r <- uniqSetToList live ]
setAssigR emptyRegMap
-- load info about register assignments leading into this block.
@@ -385,7 +391,7 @@ raInsn platform block_live new_instrs id (LiveInstr (Instr instr) (Just live))
Just (src,dst) | src `elementOfUniqSet` (liveDieRead live),
isVirtualReg dst,
not (dst `elemUFM` assig),
- Just (InReg _) <- (lookupUFM assig src) -> do
+ isRealReg src || isInReg src assig -> do
case src of
(RegReal rr) -> setAssigR (addToUFM assig dst (InReg rr))
-- if src is a fixed reg, then we just map dest to this
@@ -414,6 +420,11 @@ raInsn _ _ _ _ instr
= pprPanic "raInsn" (text "no match for:" <> ppr instr)
+isInReg :: Reg -> RegMap Loc -> Bool
+isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
+ | otherwise = False
+
+
genRaInsn :: (FR freeRegs, Instruction instr, Outputable instr)
=> Platform
-> BlockMap RegSet
@@ -441,7 +452,7 @@ genRaInsn platform block_live new_instrs block_id instr r_dying w_dying =
-- debugging
{- freeregs <- getFreeRegsR
assig <- getAssigR
- pprTrace "genRaInsn"
+ pprDebugAndThen (defaultDynFlags Settings{ sTargetPlatform=platform }) trace "genRaInsn"
(ppr instr
$$ text "r_dying = " <+> ppr r_dying
$$ text "w_dying = " <+> ppr w_dying
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index d480f78b1d..65a3dd7f57 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -166,11 +166,15 @@ stmtToInstrs stmt = do
CmmBranch id -> genBranch id
CmmCondBranch arg id -> genCondJump id arg
CmmSwitch arg ids -> genSwitch arg ids
- CmmJump arg _ -> genJump arg
+ CmmJump arg gregs -> genJump arg (jumpRegs gregs)
CmmReturn ->
panic "stmtToInstrs: return statement should have been cps'd away"
+jumpRegs :: Maybe [GlobalReg] -> [Reg]
+jumpRegs Nothing = allHaskellArgRegs
+jumpRegs (Just gregs) = [ RegReal r | Just r <- map globalRegMaybe gregs ]
+
--------------------------------------------------------------------------------
-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
-- They are really trees of insns to facilitate fast appending, where a
@@ -928,9 +932,9 @@ getNonClobberedReg expr = do
tmp <- getNewRegNat rep
return (tmp, code tmp)
Fixed rep reg code
- -- only free regs can be clobbered
- | RegReal (RealRegSingle rr) <- reg
- , isFastTrue (freeReg rr)
+ -- only certain regs can be clobbered
+ | RegReal real <- reg
+ , real `elem` instrClobberedRegs
-> do
tmp <- getNewRegNat rep
return (tmp, code `snocOL` reg2reg rep reg tmp)
@@ -1408,18 +1412,18 @@ assignReg_FltCode _ reg src = do
return (src_code (getRegisterReg use_sse2 reg))
-genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
+genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
-genJump (CmmLoad mem _) = do
+genJump (CmmLoad mem _) regs = do
Amode target code <- getAmode mem
- return (code `snocOL` JMP (OpAddr target))
+ return (code `snocOL` JMP (OpAddr target) regs)
-genJump (CmmLit lit) = do
- return (unitOL (JMP (OpImm (litToImm lit))))
+genJump (CmmLit lit) regs = do
+ return (unitOL (JMP (OpImm (litToImm lit)) regs))
-genJump expr = do
+genJump expr regs = do
(reg,code) <- getSomeReg expr
- return (code `snocOL` JMP (OpReg reg))
+ return (code `snocOL` JMP (OpReg reg) regs)
-- -----------------------------------------------------------------------------
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 18adee9915..f31bf0349f 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -287,7 +287,7 @@ data Instr
-- | POPA
-- Jumping around.
- | JMP Operand
+ | JMP Operand [Reg] -- including live Regs at the call
| JXX Cond BlockId -- includes unconditional branches
| JXX_GBL Cond Imm -- non-local version of JXX
-- Table jump
@@ -357,7 +357,7 @@ x86_regUsageOfInstr instr
SETCC _ op -> mkRU [] (def_W op)
JXX _ _ -> mkRU [] []
JXX_GBL _ _ -> mkRU [] []
- JMP op -> mkRUR (use_R op [])
+ JMP op regs -> mkRUR (use_R op regs)
JMP_TBL op _ _ _ -> mkRUR (use_R op [])
CALL (Left _) params -> mkRU params callClobberedRegs
CALL (Right reg) params -> mkRU (reg:params) callClobberedRegs
@@ -492,7 +492,7 @@ x86_patchRegsOfInstr instr env
PUSH sz op -> patch1 (PUSH sz) op
POP sz op -> patch1 (POP sz) op
SETCC cond op -> patch1 (SETCC cond) op
- JMP op -> patch1 JMP op
+ JMP op regs -> JMP (patchOp op) regs
JMP_TBL op ids s lbl-> JMP_TBL (patchOp op) ids s lbl
GMOV src dst -> GMOV (env src) (env dst)
@@ -759,7 +759,7 @@ i386_insert_ffrees blocks
= BasicBlock id (foldr p [] insns)
where p insn r = case insn of
CALL _ _ -> GFREE : insn : r
- JMP _ -> GFREE : insn : r
+ JMP _ _ -> GFREE : insn : r
JXX_GBL _ _ -> panic "i386_insert_ffrees: cannot handle JXX_GBL"
_ -> insn : r
@@ -800,9 +800,9 @@ getJumpDestBlockId (DestBlockId bid) = Just bid
getJumpDestBlockId _ = Nothing
canShortcut :: Instr -> Maybe JumpDest
-canShortcut (JXX ALWAYS id) = Just (DestBlockId id)
-canShortcut (JMP (OpImm imm)) = Just (DestImm imm)
-canShortcut _ = Nothing
+canShortcut (JXX ALWAYS id) = Just (DestBlockId id)
+canShortcut (JMP (OpImm imm) _) = Just (DestImm imm)
+canShortcut _ = Nothing
-- This helper shortcuts a sequence of branches.
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 02f8efddae..c460a9cce7 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -595,9 +595,9 @@ pprInstr platform (JXX cond blockid)
pprInstr platform (JXX_GBL cond imm) = pprCondInstr (sLit "j") cond (pprImm platform imm)
-pprInstr platform (JMP (OpImm imm)) = (<>) (ptext (sLit "\tjmp ")) (pprImm platform imm)
-pprInstr platform (JMP op) = (<>) (ptext (sLit "\tjmp *")) (pprOperand platform (archWordSize (target32Bit platform)) op)
-pprInstr platform (JMP_TBL op _ _ _) = pprInstr platform (JMP op)
+pprInstr platform (JMP (OpImm imm) _) = (<>) (ptext (sLit "\tjmp ")) (pprImm platform imm)
+pprInstr platform (JMP op _) = (<>) (ptext (sLit "\tjmp *")) (pprOperand platform (archWordSize (target32Bit platform)) op)
+pprInstr platform (JMP_TBL op _ _ _) = pprInstr platform (JMP op [])
pprInstr platform (CALL (Left imm) _) = (<>) (ptext (sLit "\tcall ")) (pprImm platform imm)
pprInstr platform (CALL (Right reg) _) = (<>) (ptext (sLit "\tcall *")) (pprReg platform (archWordSize (target32Bit platform)) reg)
diff --git a/compiler/nativeGen/X86/Regs.hs b/compiler/nativeGen/X86/Regs.hs
index 395f9140bd..b2b6a3413a 100644
--- a/compiler/nativeGen/X86/Regs.hs
+++ b/compiler/nativeGen/X86/Regs.hs
@@ -17,7 +17,9 @@ module X86.Regs (
argRegs,
allArgRegs,
allIntArgRegs,
+ allHaskellArgRegs,
callClobberedRegs,
+ instrClobberedRegs,
allMachRegNos,
classOfRealReg,
showReg,
@@ -56,6 +58,7 @@ import RegClass
import BlockId
import OldCmm
+import CmmCallConv
import CLabel ( CLabel )
import Outputable
import Platform
@@ -468,6 +471,11 @@ callClobberedRegs :: [Reg]
freeReg esp = fastBool False -- %esp is the C stack pointer
#endif
+#if i386_TARGET_ARCH
+freeReg esi = fastBool False -- Note [esi/edi not allocatable]
+freeReg edi = fastBool False
+#endif
+
#if x86_64_TARGET_ARCH
freeReg rsp = fastBool False -- %rsp is the C stack pointer
#endif
@@ -475,60 +483,9 @@ freeReg rsp = fastBool False -- %rsp is the C stack pointer
#ifdef REG_Base
freeReg REG_Base = fastBool False
#endif
-#ifdef REG_R1
-freeReg REG_R1 = fastBool False
-#endif
-#ifdef REG_R2
-freeReg REG_R2 = fastBool False
-#endif
-#ifdef REG_R3
-freeReg REG_R3 = fastBool False
-#endif
-#ifdef REG_R4
-freeReg REG_R4 = fastBool False
-#endif
-#ifdef REG_R5
-freeReg REG_R5 = fastBool False
-#endif
-#ifdef REG_R6
-freeReg REG_R6 = fastBool False
-#endif
-#ifdef REG_R7
-freeReg REG_R7 = fastBool False
-#endif
-#ifdef REG_R8
-freeReg REG_R8 = fastBool False
-#endif
-#ifdef REG_R9
-freeReg REG_R9 = fastBool False
-#endif
-#ifdef REG_R10
-freeReg REG_R10 = fastBool False
-#endif
-#ifdef REG_F1
-freeReg REG_F1 = fastBool False
-#endif
-#ifdef REG_F2
-freeReg REG_F2 = fastBool False
-#endif
-#ifdef REG_F3
-freeReg REG_F3 = fastBool False
-#endif
-#ifdef REG_F4
-freeReg REG_F4 = fastBool False
-#endif
-#ifdef REG_D1
-freeReg REG_D1 = fastBool False
-#endif
-#ifdef REG_D2
-freeReg REG_D2 = fastBool False
-#endif
#ifdef REG_Sp
freeReg REG_Sp = fastBool False
#endif
-#ifdef REG_Su
-freeReg REG_Su = fastBool False
-#endif
#ifdef REG_SpLim
freeReg REG_SpLim = fastBool False
#endif
@@ -538,7 +495,10 @@ freeReg REG_Hp = fastBool False
#ifdef REG_HpLim
freeReg REG_HpLim = fastBool False
#endif
-freeReg _ = fastBool True
+
+-- All other regs are considered to be "free", because we can track
+-- their liveness accurately.
+freeReg _ = fastBool True
-- | Returns 'Nothing' if this global register is not stored
@@ -647,6 +607,20 @@ allFPArgRegs = map regSingle [firstxmm .. firstxmm+7]
#endif
+-- All machine registers that are used for argument-passing to Haskell functions
+allHaskellArgRegs :: [Reg]
+allHaskellArgRegs = [ RegReal r | Just r <- map globalRegMaybe globalArgRegs ]
+
+-- Machine registers which might be clobbered by instructions that
+-- generate results into fixed registers, or need arguments in a fixed
+-- register.
+instrClobberedRegs :: [RealReg]
+#if i386_TARGET_ARCH
+instrClobberedRegs = map RealRegSingle [ eax, ecx, edx ]
+#elif x86_64_TARGET_ARCH
+instrClobberedRegs = map RealRegSingle [ rax, rcx, rdx ]
+#endif
+
-- | these are the regs which we cannot assume stay alive over a C call.
#if i386_TARGET_ARCH
@@ -677,6 +651,11 @@ allIntArgRegs = panic "X86.Regs.allIntArgRegs: not defined"
allFPArgRegs = panic "X86.Regs.allFPArgRegs: not defined"
callClobberedRegs = panic "X86.Regs.callClobberedRegs: not defined"
+instrClobberedRegs :: [RealReg]
+instrClobberedRegs = panic "X86.Regs.instrClobberedRegs: not defined for this arch"
+
+allHaskellArgRegs :: [Reg]
+allHaskellArgRegs = panic "X86.Regs.allHaskellArgRegs: not defined for this arch"
#endif
@@ -688,4 +667,16 @@ allocatableRegs
= let isFree i = isFastTrue (freeReg i)
in map RealRegSingle $ filter isFree allMachRegNos
+{-
+Note [esi/edi not allocatable]
+%esi is mapped to R1, so %esi would normally be allocatable while it
+is not being used for R1. However, %esi has no 8-bit version on x86,
+and the linear register allocator is not sophisticated enough to
+handle this irregularity (we need more RegClasses). The
+graph-colouring allocator also cannot handle this - it was designed
+with more flexibility in mind, but the current implementation is
+restricted to the same set of classes as the linear allocator.
+
+Hence, on x86 esi and edi are treated as not allocatable.
+-}
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 116db2526f..114f7f6b32 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -2002,7 +2002,7 @@ srcParseErr
-> MsgDoc
srcParseErr buf len
= hcat [ if null token
- then ptext (sLit "parse error (possibly incorrect indentation)")
+ then ptext (sLit "parse error (possibly incorrect indentation or mismatched brackets)")
else hcat [ptext (sLit "parse error on input "),
char '`', text token, char '\'']
]
diff --git a/compiler/prelude/PrelRules.lhs b/compiler/prelude/PrelRules.lhs
index db45bac3d2..dab34fc69d 100644
--- a/compiler/prelude/PrelRules.lhs
+++ b/compiler/prelude/PrelRules.lhs
@@ -18,6 +18,8 @@ module PrelRules ( primOpRules, builtinRules ) where
#include "HsVersions.h"
+import {-# SOURCE #-} MkId ( mkPrimOpId )
+
import CoreSyn
import MkCore
import Id
@@ -659,7 +661,15 @@ builtinIntegerRules =
rule_binop "xorInteger" xorIntegerName xor,
rule_unop "complementInteger" complementIntegerName complement,
rule_Int_binop "shiftLInteger" shiftLIntegerName shiftL,
- rule_Int_binop "shiftRInteger" shiftRIntegerName shiftR]
+ rule_Int_binop "shiftRInteger" shiftRIntegerName shiftR,
+ -- These rules below don't actually have to be built in, but if we
+ -- put them in the Haskell source then we'd have to duplicate them
+ -- between all Integer implementations
+ rule_smallIntegerToInt "smallIntegerToInt" integerToIntName,
+ rule_smallIntegerTo "smallIntegerToWord" integerToWordName Int2WordOp,
+ rule_smallIntegerTo "smallIntegerToFloat" floatFromIntegerName Int2FloatOp,
+ rule_smallIntegerTo "smallIntegerToDouble" doubleFromIntegerName Int2DoubleOp
+ ]
where rule_convert str name convert
= BuiltinRule { ru_name = fsLit str, ru_fn = name, ru_nargs = 1,
ru_try = match_Integer_convert convert }
@@ -702,6 +712,12 @@ builtinIntegerRules =
rule_decodeDouble str name
= BuiltinRule { ru_name = fsLit str, ru_fn = name, ru_nargs = 1,
ru_try = match_decodeDouble }
+ rule_smallIntegerToInt str name
+ = BuiltinRule { ru_name = fsLit str, ru_fn = name, ru_nargs = 1,
+ ru_try = match_smallIntegerToInt }
+ rule_smallIntegerTo str name primOp
+ = BuiltinRule { ru_name = fsLit str, ru_fn = name, ru_nargs = 1,
+ ru_try = match_smallIntegerTo primOp }
---------------------------------------------------
-- The rule is this:
@@ -809,7 +825,7 @@ match_Word64ToInteger :: Id
-> [Expr CoreBndr]
-> Maybe (Expr CoreBndr)
match_Word64ToInteger id id_unf [xl]
- | Just (MachWord x) <- exprIsLiteral_maybe id_unf xl
+ | Just (MachWord64 x) <- exprIsLiteral_maybe id_unf xl
= case idType id of
FunTy _ integerTy ->
Just (Lit (LitInteger x integerTy))
@@ -946,4 +962,23 @@ match_decodeDouble fn id_unf [xl]
_ ->
panic "match_decodeDouble: Id has the wrong type"
match_decodeDouble _ _ _ = Nothing
+
+match_smallIntegerToInt :: Id
+ -> IdUnfoldingFun
+ -> [Expr CoreBndr]
+ -> Maybe (Expr CoreBndr)
+match_smallIntegerToInt _ _ [App (Var x) y]
+ | idName x == smallIntegerName
+ = Just y
+match_smallIntegerToInt _ _ _ = Nothing
+
+match_smallIntegerTo :: PrimOp
+ -> Id
+ -> IdUnfoldingFun
+ -> [Expr CoreBndr]
+ -> Maybe (Expr CoreBndr)
+match_smallIntegerTo primOp _ _ [App (Var x) y]
+ | idName x == smallIntegerName
+ = Just $ App (Var (mkPrimOpId primOp)) y
+match_smallIntegerTo _ _ _ _ = Nothing
\end{code}
diff --git a/compiler/prelude/PrimOp.lhs-boot b/compiler/prelude/PrimOp.lhs-boot
new file mode 100644
index 0000000000..5d003f2b51
--- /dev/null
+++ b/compiler/prelude/PrimOp.lhs-boot
@@ -0,0 +1,7 @@
+
+\begin{code}
+module PrimOp where
+
+data PrimOp
+\end{code}
+
diff --git a/compiler/rename/RnEnv.lhs b/compiler/rename/RnEnv.lhs
index 65b34ac709..6b01da4722 100644
--- a/compiler/rename/RnEnv.lhs
+++ b/compiler/rename/RnEnv.lhs
@@ -73,6 +73,7 @@ import ListSetOps ( removeDups )
import DynFlags
import FastString
import Control.Monad
+import Data.List
import qualified Data.Set as Set
import Constants ( mAX_TUPLE_SIZE )
\end{code}
@@ -1641,7 +1642,7 @@ dupNamesErr get_loc names
where
locs = map get_loc names
big_loc = foldr1 combineSrcSpans locs
- locations = ptext (sLit "Bound at:") <+> vcat (map ppr (sortLe (<=) locs))
+ locations = ptext (sLit "Bound at:") <+> vcat (map ppr (sort locs))
kindSigErr :: Outputable a => a -> SDoc
kindSigErr thing
diff --git a/compiler/simplCore/CoreMonad.lhs b/compiler/simplCore/CoreMonad.lhs
index 272bdfb71c..b1429c5dbf 100644
--- a/compiler/simplCore/CoreMonad.lhs
+++ b/compiler/simplCore/CoreMonad.lhs
@@ -95,9 +95,10 @@ import UniqSupply
import UniqFM ( UniqFM, mapUFM, filterUFM )
import MonadUtils
-import Util ( split, sortLe )
+import Util ( split )
import ListSetOps ( runs )
-import Data.List ( intersperse )
+import Data.List
+import Data.Ord
import Data.Dynamic
import Data.IORef
import Data.Map (Map)
@@ -581,9 +582,8 @@ pprTickGroup :: [(Tick, Int)] -> SDoc
pprTickGroup group@((tick1,_):_)
= hang (int (sum [n | (_,n) <- group]) <+> text (tickString tick1))
2 (vcat [ int n <+> pprTickCts tick
- | (tick,n) <- sortLe le group])
- where
- le (_,n1) (_,n2) = n2 <= n1 -- We want largest first
+ -- flip as we want largest first
+ | (tick,n) <- sortBy (flip (comparing snd)) group])
pprTickGroup [] = panic "pprTickGroup"
\end{code}
diff --git a/compiler/simplCore/Simplify.lhs b/compiler/simplCore/Simplify.lhs
index fca2f1fff9..115dd94bd4 100644
--- a/compiler/simplCore/Simplify.lhs
+++ b/compiler/simplCore/Simplify.lhs
@@ -731,7 +731,7 @@ simplUnfolding :: SimplEnv-> TopLevelFlag
simplUnfolding env _ _ _ (DFunUnfolding ar con ops)
= return (DFunUnfolding ar con ops')
where
- ops' = map (substExpr (text "simplUnfolding") env) ops
+ ops' = map (fmap (substExpr (text "simplUnfolding") env)) ops
simplUnfolding env top_lvl id _
(CoreUnfolding { uf_tmpl = expr, uf_arity = arity
diff --git a/compiler/simplStg/SRT.lhs b/compiler/simplStg/SRT.lhs
index bd2fb5e211..0d474c5b63 100644
--- a/compiler/simplStg/SRT.lhs
+++ b/compiler/simplStg/SRT.lhs
@@ -20,7 +20,7 @@ import Bitmap
import Outputable
-import Util
+import Data.List
\end{code}
\begin{code}
@@ -148,7 +148,7 @@ constructSRT table (SRTEntries entries)
where
ints = map (expectJust "constructSRT" . lookupVarEnv table)
(varSetElems entries)
- sorted_ints = sortLe (<=) ints
+ sorted_ints = sort ints
offset = head sorted_ints
bitmap_entries = map (subtract offset) sorted_ints
len = last bitmap_entries + 1
diff --git a/compiler/simplStg/SimplStg.lhs b/compiler/simplStg/SimplStg.lhs
index d5024ab2e0..635df3ce41 100644
--- a/compiler/simplStg/SimplStg.lhs
+++ b/compiler/simplStg/SimplStg.lhs
@@ -55,7 +55,10 @@ stg2stg dflags module_name binds
; (processed_binds, _, cost_centres)
<- foldl_mn do_stg_pass (binds', us0, ccs) (getStgToDo dflags)
- ; let srt_binds = computeSRTs (unarise us1 processed_binds)
+ ; let un_binds = unarise us1 processed_binds
+ ; let srt_binds
+ | dopt Opt_TryNewCodeGen dflags = zip un_binds (repeat [])
+ | otherwise = computeSRTs un_binds
; dumpIfSet_dyn dflags Opt_D_dump_stg "STG syntax:"
(pprStgBindingsWithSRTs srt_binds)
diff --git a/compiler/specialise/Rules.lhs b/compiler/specialise/Rules.lhs
index 42c1eda081..498302a5e9 100644
--- a/compiler/specialise/Rules.lhs
+++ b/compiler/specialise/Rules.lhs
@@ -54,6 +54,7 @@ import Maybes
import Bag
import Util
import Data.List
+import Data.Ord
\end{code}
Note [Overall plumbing for rules]
@@ -239,10 +240,8 @@ pprRulesForUser :: [CoreRule] -> SDoc
pprRulesForUser rules
= withPprStyle defaultUserStyle $
pprRules $
- sortLe le_rule $
+ sortBy (comparing ru_name) $
tidyRules emptyTidyEnv rules
- where
- le_rule r1 r2 = ru_name r1 <= ru_name r2
\end{code}
diff --git a/compiler/stgSyn/StgLint.lhs b/compiler/stgSyn/StgLint.lhs
index ac394164b7..852202f5f7 100644
--- a/compiler/stgSyn/StgLint.lhs
+++ b/compiler/stgSyn/StgLint.lhs
@@ -26,6 +26,7 @@ import SrcLoc
import Outputable
import FastString
import Control.Monad
+import Data.Function
#include "HsVersions.h"
\end{code}
diff --git a/compiler/typecheck/TcDeriv.lhs b/compiler/typecheck/TcDeriv.lhs
index bbda3cfcf0..dd797ab274 100644
--- a/compiler/typecheck/TcDeriv.lhs
+++ b/compiler/typecheck/TcDeriv.lhs
@@ -62,6 +62,7 @@ import FastString
import Bag
import Control.Monad
+import Data.List
\end{code}
%************************************************************************
@@ -1406,7 +1407,7 @@ inferInstanceContexts oflag infer_specs
-- Claim: the result instance declaration is guaranteed valid
-- Hence no need to call:
-- checkValidInstance tyvars theta clas inst_tys
- ; return (sortLe (\p1 p2 -> cmpType p1 p2 /= GT) theta) } -- Canonicalise before returning the solution
+ ; return (sortBy cmpType theta) } -- Canonicalise before returning the solution
where
the_pred = mkClassPred clas inst_tys
diff --git a/compiler/typecheck/TcErrors.lhs b/compiler/typecheck/TcErrors.lhs
index dd20277372..c0762daeb2 100644
--- a/compiler/typecheck/TcErrors.lhs
+++ b/compiler/typecheck/TcErrors.lhs
@@ -684,6 +684,9 @@ mkDictErr ctxt cts
-- Report definite no-instance errors,
-- or (iff there are none) overlap errors
+ -- But we report only one of them (hence 'head') because they all
+ -- have the same source-location origin, to try to avoid a cascade
+ -- of errors from one location
; (ctxt, err) <- mk_dict_err ctxt (head (no_inst_cts ++ overlap_cts))
; mkErrorReport ctxt err }
where
diff --git a/compiler/typecheck/TcInstDcls.lhs b/compiler/typecheck/TcInstDcls.lhs
index 49c5131275..9eb747ad51 100644
--- a/compiler/typecheck/TcInstDcls.lhs
+++ b/compiler/typecheck/TcInstDcls.lhs
@@ -39,6 +39,7 @@ import TcEnv
import TcHsType
import TcUnify
import MkCore ( nO_METHOD_BINDING_ERROR_ID )
+import CoreSyn ( DFunArg(..) )
import Type
import TcEvidence
import TyCon
@@ -49,7 +50,7 @@ import VarEnv
import VarSet ( mkVarSet, subVarSet, varSetElems )
import Pair
import CoreUnfold ( mkDFunUnfolding )
-import CoreSyn ( Expr(Var), CoreExpr, varToCoreExpr )
+import CoreSyn ( Expr(Var), CoreExpr )
import PrelNames ( typeableClassNames )
import Bag
@@ -731,13 +732,13 @@ tcInstDecl2 (InstInfo { iSpec = ispec, iBinds = ibinds })
-- See Note [Subtle interaction of recursion and overlap]
-- and Note [Binding when looking up instances]
; let (clas, inst_tys) = tcSplitDFunHead inst_head
- (class_tyvars, sc_theta, sc_sels, op_items) = classBigSig clas
+ (class_tyvars, sc_theta, _, op_items) = classBigSig clas
sc_theta' = substTheta (zipOpenTvSubst class_tyvars inst_tys) sc_theta
+
; dfun_ev_vars <- newEvVars dfun_theta
- ; (sc_args, sc_binds)
- <- mapAndUnzipM (tcSuperClass inst_tyvars dfun_ev_vars)
- (sc_sels `zip` sc_theta')
+ ; (sc_binds, sc_ev_vars, sc_dfun_args)
+ <- tcSuperClasses dfun_id inst_tyvars dfun_ev_vars sc_theta'
-- Deal with 'SPECIALISE instance' pragmas
-- See Note [SPECIALISE instance pragmas]
@@ -770,20 +771,14 @@ tcInstDecl2 (InstInfo { iSpec = ispec, iBinds = ibinds })
-- con_app_args = MkD ty1 ty2 sc1 sc2 op1 op2
con_app_tys = wrapId (mkWpTyApps inst_tys)
(dataConWrapId dict_constr)
- con_app_scs = mkHsWrap (mkWpEvApps (map mk_sc_ev_term sc_args)) con_app_tys
+ con_app_scs = mkHsWrap (mkWpEvApps (map EvId sc_ev_vars)) con_app_tys
con_app_args = foldl mk_app con_app_scs $
map (wrapId arg_wrapper) meth_ids
mk_app :: HsExpr Id -> HsExpr Id -> HsExpr Id
mk_app fun arg = HsApp (L loc fun) (L loc arg)
- mk_sc_ev_term :: EvVar -> EvTerm
- mk_sc_ev_term sc
- | null inst_tv_tys
- , null dfun_ev_vars = EvId sc
- | otherwise = EvDFunApp sc inst_tv_tys (map EvId dfun_ev_vars)
-
- inst_tv_tys = mkTyVarTys inst_tyvars
+ inst_tv_tys = mkTyVarTys inst_tyvars
arg_wrapper = mkWpEvVarApps dfun_ev_vars <.> mkWpTyApps inst_tv_tys
-- Do not inline the dfun; instead give it a magic DFunFunfolding
@@ -796,9 +791,8 @@ tcInstDecl2 (InstInfo { iSpec = ispec, iBinds = ibinds })
= dfun_id `setIdUnfolding` mkDFunUnfolding dfun_ty dfun_args
`setInlinePragma` dfunInlinePragma
- dfun_args :: [CoreExpr]
- dfun_args = map varToCoreExpr sc_args ++
- map Var meth_ids
+ dfun_args :: [DFunArg CoreExpr]
+ dfun_args = sc_dfun_args ++ map (DFunPolyArg . Var) meth_ids
export = ABE { abe_wrap = idHsWrapper, abe_poly = dfun_id_w_fun
, abe_mono = self_dict, abe_prags = noSpecPrags }
@@ -806,12 +800,11 @@ tcInstDecl2 (InstInfo { iSpec = ispec, iBinds = ibinds })
main_bind = AbsBinds { abs_tvs = inst_tyvars
, abs_ev_vars = dfun_ev_vars
, abs_exports = [export]
- , abs_ev_binds = emptyTcEvBinds
+ , abs_ev_binds = sc_binds
, abs_binds = unitBag dict_bind }
; return (unitBag (L loc main_bind) `unionBags`
- listToBag meth_binds `unionBags`
- unionManyBags sc_binds)
+ listToBag meth_binds)
}
where
dfun_ty = idType dfun_id
@@ -819,6 +812,31 @@ tcInstDecl2 (InstInfo { iSpec = ispec, iBinds = ibinds })
loc = getSrcSpan dfun_id
------------------------------
+tcSuperClasses :: DFunId -> [TcTyVar] -> [EvVar] -> TcThetaType
+ -> TcM (TcEvBinds, [EvVar], [DFunArg CoreExpr])
+-- See Note [Silent superclass arguments]
+tcSuperClasses dfun_id inst_tyvars dfun_ev_vars sc_theta
+ = do { -- Check that all superclasses can be deduced from
+ -- the originally-specified dfun arguments
+ ; (sc_binds, sc_evs) <- checkConstraints InstSkol inst_tyvars orig_ev_vars $
+ emitWanteds ScOrigin sc_theta
+
+ ; if null inst_tyvars && null dfun_ev_vars
+ then return (sc_binds, sc_evs, map (DFunPolyArg . Var) sc_evs)
+ else return (emptyTcEvBinds, sc_lam_args, sc_dfun_args) }
+ where
+ n_silent = dfunNSilent dfun_id
+ n_tv_args = length inst_tyvars
+ orig_ev_vars = drop n_silent dfun_ev_vars
+
+ (sc_lam_args, sc_dfun_args) = unzip (map (find n_tv_args dfun_ev_vars) sc_theta)
+ find _ [] pred
+ = pprPanic "tcInstDecl2" (ppr dfun_id $$ ppr (idType dfun_id) $$ ppr pred)
+ find i (ev:evs) pred
+ | pred `eqPred` evVarPred ev = (ev, DFunLamArg i)
+ | otherwise = find (i+1) evs pred
+
+----------------------
mkMethIds :: HsSigFun -> Class -> [TcTyVar] -> [EvVar]
-> [TcType] -> Id -> TcM (TcId, TcSigInfo)
mkMethIds sig_fn clas tyvars dfun_ev_vars inst_tys sel_id
@@ -875,33 +893,6 @@ misplacedInstSig name hs_ty
, ptext (sLit "(Use -XInstanceSigs to allow this)") ]
------------------------------
-tcSuperClass :: [TcTyVar] -> [EvVar]
- -> (Id, PredType)
- -> TcM (TcId, LHsBinds TcId)
-
--- Build a top level decl like
--- sc_op = /\a \d. let sc = ... in
--- sc
--- and return sc_op, that binding
-
-tcSuperClass tyvars ev_vars (sc_sel, sc_pred)
- = do { (ev_binds, sc_dict)
- <- newImplication InstSkol tyvars ev_vars $
- emitWanted ScOrigin sc_pred
-
- ; uniq <- newUnique
- ; let sc_op_ty = mkForAllTys tyvars $ mkPiTypes ev_vars (varType sc_dict)
- sc_op_name = mkDerivedInternalName mkClassOpAuxOcc uniq
- (getName sc_sel)
- sc_op_id = mkLocalId sc_op_name sc_op_ty
- sc_op_bind = mkVarBind sc_op_id (L noSrcSpan $ wrapId sc_wrapper sc_dict)
- sc_wrapper = mkWpTyLams tyvars
- <.> mkWpLams ev_vars
- <.> mkWpLet ev_binds
-
- ; return (sc_op_id, unitBag sc_op_bind) }
-
-------------------------------
tcSpecInstPrags :: DFunId -> InstBindings Name
-> TcM ([Located TcSpecPrag], PragFun)
tcSpecInstPrags _ (NewTypeDerived {})
@@ -913,8 +904,17 @@ tcSpecInstPrags dfun_id (VanillaInst binds uprags _)
; return (spec_inst_prags, mkPragFun uprags binds) }
\end{code}
-Note [Superclass loop avoidance]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Note [Silent superclass arguments]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+See Trac #3731, #4809, #5751, #5913, #6117, which all
+describe somewhat more complicated situations, but ones
+encountered in practice.
+
+ THE PROBLEM
+
+The problem is that it is all too easy to create a class whose
+superclass is bottom when it should not be.
+
Consider the following (extreme) situation:
class C a => D a where ...
instance D [a] => D [a] where ...
@@ -929,10 +929,51 @@ argument:
dfun :: forall a. D [a] -> D [a]
dfun = \d::D [a] -> MkD (scsel d) ..
-Rather, we want to get it by finding an instance for (C [a]). We
-achieve this by
- not making the superclasses of a "wanted"
- available for solving wanted constraints.
+Otherwise if we later encounter a situation where
+we have a [Wanted] dw::D [a] we might solve it thus:
+ dw := dfun dw
+Which is all fine except that now ** the superclass C is bottom **!
+
+ THE SOLUTION
+
+Our solution to this problem is "silent superclass arguments". We pass
+to each dfun some "silent superclass arguments", which are the
+immediate superclasses of the dictionary we are trying to
+construct. In our example:
+ dfun :: forall a. C [a] -> D [a] -> D [a]
+ dfun = \(dc::C [a]) (dd::D [a]) -> DOrd dc ...
+Notice the extra (dc :: C [a]) argument compared to the previous version.
+
+This gives us:
+
+ -----------------------------------------------------------
+ DFun Superclass Invariant
+ ~~~~~~~~~~~~~~~~~~~~~~~~
+ In the body of a DFun, every superclass argument to the
+ returned dictionary is
+ either * one of the arguments of the DFun,
+ or * constant, bound at top level
+ -----------------------------------------------------------
+
+The net effect is that it is safe to treat a dfun application as
+wrapping a dictionary constructor around its arguments (in particular,
+a dfun never picks superclasses from the arguments under the
+dictionary constructor). No superclass is hidden inside a dfun
+application.
+
+The extra arguments required to satisfy the DFun Superclass Invariant
+always come first, and are called the "silent" arguments. DFun types
+are built (only) by MkId.mkDictFunId, so that is where we decide
+what silent arguments are to be added.
+
+In our example, if we had [Wanted] dw :: D [a] we would get via the instance:
+ dw := dfun d1 d2
+ [Wanted] (d1 :: C [a])
+ [Wanted] (d2 :: D [a])
+
+And now, though we *can* solve:
+ d2 := dw
+That's fine; and we solve d1:C[a] separately.
Test case SCLoop tests this fix.
@@ -980,7 +1021,7 @@ tcSpecInst dfun_id prag@(SpecInstSig hs_ty)
= addErrCtxt (spec_ctxt prag) $
do { let name = idName dfun_id
; (tyvars, theta, clas, tys) <- tcHsInstHead SpecInstCtxt hs_ty
- ; let spec_dfun_ty = mkDictFunTy tyvars theta clas tys
+ ; let (_, spec_dfun_ty) = mkDictFunTy tyvars theta clas tys
; co_fn <- tcSubType (SpecPragOrigin name) SpecInstCtxt
(idType dfun_id) spec_dfun_ty
diff --git a/compiler/typecheck/TcRnDriver.lhs b/compiler/typecheck/TcRnDriver.lhs
index 778a4b266f..e3ec10a073 100644
--- a/compiler/typecheck/TcRnDriver.lhs
+++ b/compiler/typecheck/TcRnDriver.lhs
@@ -82,6 +82,7 @@ import TcType ( orphNamesOfDFunHead )
import Inst ( tcGetInstEnvs )
import Data.List ( sortBy )
import Data.IORef ( readIORef )
+import Data.Ord
#ifdef GHCI
import TcType ( isUnitTy, isTauTy )
@@ -1326,6 +1327,7 @@ tcUserStmt (L loc (ExprStmt expr _ _ _))
-- Don't try to typecheck if the renamer fails!
; ghciStep <- getGhciStepIO
; uniq <- newUnique
+ ; interPrintName <- getInteractivePrintName
; let fresh_it = itName uniq loc
matches = [mkMatch [] rn_expr emptyLocalBinds]
-- [it = expr]
@@ -1344,7 +1346,7 @@ tcUserStmt (L loc (ExprStmt expr _ _ _))
(HsVar bindIOName) noSyntaxExpr
-- [; print it]
- print_it = L loc $ ExprStmt (nlHsApp (nlHsVar printName) (nlHsVar fresh_it))
+ print_it = L loc $ ExprStmt (nlHsApp (nlHsVar interPrintName) (nlHsVar fresh_it))
(HsVar thenIOName) noSyntaxExpr placeHolderType
-- The plans are:
@@ -1879,17 +1881,15 @@ ppr_fam_insts fam_insts =
ppr_sigs :: [Var] -> SDoc
ppr_sigs ids
-- Print type signatures; sort by OccName
- = vcat (map ppr_sig (sortLe le_sig ids))
+ = vcat (map ppr_sig (sortBy (comparing getOccName) ids))
where
- le_sig id1 id2 = getOccName id1 <= getOccName id2
ppr_sig id = hang (ppr id <+> dcolon) 2 (ppr (tidyTopType (idType id)))
ppr_tydecls :: [TyCon] -> SDoc
ppr_tydecls tycons
-- Print type constructor info; sort by OccName
- = vcat (map ppr_tycon (sortLe le_sig tycons))
+ = vcat (map ppr_tycon (sortBy (comparing getOccName) tycons))
where
- le_sig tycon1 tycon2 = getOccName tycon1 <= getOccName tycon2
ppr_tycon tycon = vcat [ ppr (tyConName tycon) <+> dcolon <+> ppr (tyConKind tycon)
-- Temporarily print the kind signature too
, ppr (tyThingToIfaceDecl (ATyCon tycon)) ]
diff --git a/compiler/typecheck/TcRnMonad.lhs b/compiler/typecheck/TcRnMonad.lhs
index 8acd0db6f3..f68599898e 100644
--- a/compiler/typecheck/TcRnMonad.lhs
+++ b/compiler/typecheck/TcRnMonad.lhs
@@ -493,6 +493,9 @@ getIsGHCi = do { mod <- getModule; return (mod == iNTERACTIVE) }
getGHCiMonad :: TcRn Name
getGHCiMonad = do { hsc <- getTopEnv; return (ic_monad $ hsc_IC hsc) }
+getInteractivePrintName :: TcRn Name
+getInteractivePrintName = do { hsc <- getTopEnv; return (ic_int_print $ hsc_IC hsc) }
+
tcIsHsBoot :: TcRn Bool
tcIsHsBoot = do { env <- getGblEnv; return (isHsBoot (tcg_src env)) }
diff --git a/compiler/typecheck/TcSplice.lhs b/compiler/typecheck/TcSplice.lhs
index fac61afe65..4f3731ae0d 100644
--- a/compiler/typecheck/TcSplice.lhs
+++ b/compiler/typecheck/TcSplice.lhs
@@ -1296,12 +1296,13 @@ reifyClass cls
------------------------------
reifyClassInstance :: ClsInst -> TcM TH.Dec
reifyClassInstance i
- = do { cxt <- reifyCxt theta
+ = do { cxt <- reifyCxt (drop n_silent theta)
; thtypes <- reifyTypes types
; let head_ty = foldl TH.AppT (TH.ConT (reifyName cls)) thtypes
; return $ (TH.InstanceD cxt head_ty []) }
where
(_tvs, theta, cls, types) = instanceHead i
+ n_silent = dfunNSilent (instanceDFunId i)
------------------------------
reifyFamilyInstance :: FamInst -> TcM TH.Dec
diff --git a/compiler/types/InstEnv.lhs b/compiler/types/InstEnv.lhs
index 21e1acd3e7..388846b8ee 100644
--- a/compiler/types/InstEnv.lhs
+++ b/compiler/types/InstEnv.lhs
@@ -155,8 +155,15 @@ pprInstance ispec
pprInstanceHdr :: ClsInst -> SDoc
-- Prints the ClsInst as an instance declaration
pprInstanceHdr (ClsInst { is_flag = flag, is_dfun = dfun })
- = ptext (sLit "instance") <+> ppr flag <+> pprSigmaType (idType dfun)
- -- Print without the for-all, which the programmer doesn't write
+ = getPprStyle $ \ sty ->
+ let theta_to_print
+ | debugStyle sty = theta
+ | otherwise = drop (dfunNSilent dfun) theta
+ in ptext (sLit "instance") <+> ppr flag
+ <+> sep [pprThetaArrowTy theta_to_print, ppr res_ty]
+ where
+ (_, theta, res_ty) = tcSplitSigmaTy (idType dfun)
+ -- Print without the for-all, which the programmer doesn't write
pprInstances :: [ClsInst] -> SDoc
pprInstances ispecs = vcat (map pprInstance ispecs)
diff --git a/compiler/utils/Digraph.lhs b/compiler/utils/Digraph.lhs
index f7bdff2612..9ae84a7897 100644
--- a/compiler/utils/Digraph.lhs
+++ b/compiler/utils/Digraph.lhs
@@ -47,7 +47,7 @@ module Digraph(
------------------------------------------------------------------------------
-import Util ( sortLe, minWith, count )
+import Util ( minWith, count )
import Outputable
import Maybes ( expectJust )
import MonadUtils ( allM )
@@ -59,7 +59,8 @@ import Control.Monad.ST
-- std interfaces
import Data.Maybe
import Data.Array
-import Data.List ( (\\) )
+import Data.List hiding (transpose)
+import Data.Ord
import Data.Array.ST
import qualified Data.Map as Map
import qualified Data.Set as Set
@@ -140,8 +141,7 @@ reduceNodesIntoVertices nodes key_extractor = (bounds, (!) vertex_map, key_verte
max_v = length nodes - 1
bounds = (0, max_v) :: (Vertex, Vertex)
- sorted_nodes = let n1 `le` n2 = (key_extractor n1 `compare` key_extractor n2) /= GT
- in sortLe le nodes
+ sorted_nodes = sortBy (comparing key_extractor) nodes
numbered_nodes = zipWith (,) [0..] sorted_nodes
key_map = array bounds [(i, key_extractor node) | (i, node) <- numbered_nodes]
diff --git a/compiler/utils/ListSetOps.lhs b/compiler/utils/ListSetOps.lhs
index 0dc873eb62..077eae2574 100644
--- a/compiler/utils/ListSetOps.lhs
+++ b/compiler/utils/ListSetOps.lhs
@@ -11,16 +11,10 @@ module ListSetOps (
-- Association lists
Assoc, assoc, assocMaybe, assocUsing, assocDefault, assocDefaultUsing,
- emptyAssoc, unitAssoc, mapAssoc, plusAssoc_C, extendAssoc_C,
- mkLookupFun, findInList, assocElts,
-- Duplicate handling
hasNoDups, runs, removeDups, findDupsEq,
equivClasses, equivClassesByUniq,
-
- -- Remove redudant elts
- removeRedundant -- Used in the ghc/InteractiveUI,
- -- although not in the compiler itself
) where
#include "HsVersions.h"
@@ -71,22 +65,11 @@ Inefficient finite maps based on association lists and equality.
-- A finite mapping based on equality and association lists
type Assoc a b = [(a,b)]
-emptyAssoc :: Assoc a b
-unitAssoc :: a -> b -> Assoc a b
-assocElts :: Assoc a b -> [(a,b)]
assoc :: (Eq a) => String -> Assoc a b -> a -> b
assocDefault :: (Eq a) => b -> Assoc a b -> a -> b
assocUsing :: (a -> a -> Bool) -> String -> Assoc a b -> a -> b
assocMaybe :: (Eq a) => Assoc a b -> a -> Maybe b
assocDefaultUsing :: (a -> a -> Bool) -> b -> Assoc a b -> a -> b
-mapAssoc :: (b -> c) -> Assoc a b -> Assoc a c
-extendAssoc_C :: (Eq a) => (b -> b -> b) -> Assoc a b -> (a,b) -> Assoc a b
-plusAssoc_C :: (Eq a) => (b -> b -> b) -> Assoc a b -> Assoc a b -> Assoc a b
- -- combining fn takes (old->new->result)
-
-emptyAssoc = []
-unitAssoc a b = [(a,b)]
-assocElts xs = xs
assocDefaultUsing _ deflt [] _ = deflt
assocDefaultUsing eq deflt ((k,v) : rest) key
@@ -102,45 +85,8 @@ assocMaybe alist key
where
lookup [] = Nothing
lookup ((tv,ty):rest) = if key == tv then Just ty else lookup rest
-
-mapAssoc f alist = [(key, f val) | (key,val) <- alist]
-
-plusAssoc_C _ [] new = new -- Shortcut for common case
-plusAssoc_C combine old new = foldl (extendAssoc_C combine) old new
-
-extendAssoc_C combine old_list (new_key, new_val)
- = go old_list
- where
- go [] = [(new_key, new_val)]
- go ((old_key, old_val) : old_list)
- | new_key == old_key = ((old_key, old_val `combine` new_val) : old_list)
- | otherwise = (old_key, old_val) : go old_list
-\end{code}
-
-
-@mkLookupFun eq alist@ is a function which looks up
-its argument in the association list @alist@, returning a Maybe type.
-@mkLookupFunDef@ is similar except that it is given a value to return
-on failure.
-
-\begin{code}
-mkLookupFun :: (key -> key -> Bool) -- Equality predicate
- -> [(key,val)] -- The assoc list
- -> key -- The key
- -> Maybe val -- The corresponding value
-
-mkLookupFun eq alist s
- = case [a | (s',a) <- alist, s' `eq` s] of
- [] -> Nothing
- (a:_) -> Just a
-
-findInList :: (a -> Bool) -> [a] -> Maybe a
-findInList _ [] = Nothing
-findInList p (x:xs) | p x = Just x
- | otherwise = findInList p xs
\end{code}
-
%************************************************************************
%* *
\subsection[Utils-dups]{Duplicate-handling}
@@ -167,10 +113,9 @@ equivClasses :: (a -> a -> Ordering) -- Comparison
equivClasses _ [] = []
equivClasses _ stuff@[_] = [stuff]
-equivClasses cmp items = runs eq (sortLe le items)
+equivClasses cmp items = runs eq (sortBy cmp items)
where
eq a b = case cmp a b of { EQ -> True; _ -> False }
- le a b = case cmp a b of { LT -> True; EQ -> True; GT -> False }
\end{code}
The first cases in @equivClasses@ above are just to cut to the point
@@ -212,22 +157,6 @@ findDupsEq _ [] = []
findDupsEq eq (x:xs) | null eq_xs = findDupsEq eq xs
| otherwise = (x:eq_xs) : findDupsEq eq neq_xs
where (eq_xs, neq_xs) = partition (eq x) xs
-
-removeRedundant :: (a -> a -> Bool) -- True <=> discard the *second* argument
- -> [a] -> [a]
--- Remove any element y for which
--- another element x is in the list
--- and (x `subsumes` y)
--- Preserves order
-removeRedundant subsumes xs
- = WARN( length xs > 10, text "removeRedundant" <+> int (length xs) )
- -- This is a quadratic algorithm :-) so warn if the list gets long
- go [] xs
- where
- go acc [] = reverse acc
- go acc (x:xs)
- | any (`subsumes` x) acc = go acc xs
- | otherwise = go (x : filterOut (x `subsumes`) acc) xs
\end{code}
diff --git a/compiler/utils/MonadUtils.hs b/compiler/utils/MonadUtils.hs
index 3108a03d64..6f15ecc03d 100644
--- a/compiler/utils/MonadUtils.hs
+++ b/compiler/utils/MonadUtils.hs
@@ -16,8 +16,6 @@ module MonadUtils
, MonadFix(..)
, MonadIO(..)
- , ID, runID
-
, liftIO1, liftIO2, liftIO3, liftIO4
, zipWith3M
@@ -32,8 +30,6 @@ module MonadUtils
, maybeMapM
) where
-import Outputable
-
-------------------------------------------------------------------------------
-- Detection of available libraries
-------------------------------------------------------------------------------
@@ -55,20 +51,6 @@ import Control.Monad
import Control.Monad.Fix
-------------------------------------------------------------------------------
--- The ID monad
--------------------------------------------------------------------------------
-
-newtype ID a = ID a
-instance Monad ID where
- return x = ID x
- (ID x) >>= f = f x
- _ >> y = y
- fail s = panic s
-
-runID :: ID a -> a
-runID (ID x) = x
-
--------------------------------------------------------------------------------
-- MTL
-------------------------------------------------------------------------------
diff --git a/compiler/utils/OrdList.lhs b/compiler/utils/OrdList.lhs
index a56cdf3f58..7e3b24a5da 100644
--- a/compiler/utils/OrdList.lhs
+++ b/compiler/utils/OrdList.lhs
@@ -27,11 +27,14 @@ infixl 5 `snocOL`
infixr 5 `consOL`
data OrdList a
- = Many [a] -- Invariant: non-empty
+ = None
+ | One a
+ | Many [a] -- Invariant: non-empty
+ | Cons a (OrdList a)
+ | Snoc (OrdList a) a
| Two (OrdList a) -- Invariant: non-empty
(OrdList a) -- Invariant: non-empty
- | One a
- | None
+
nilOL :: OrdList a
isNilOL :: OrdList a -> Bool
@@ -44,22 +47,33 @@ concatOL :: [OrdList a] -> OrdList a
nilOL = None
unitOL as = One as
-snocOL None b = One b
-snocOL as b = Two as (One b)
-consOL a None = One a
-consOL a bs = Two (One a) bs
+snocOL as b = Snoc as b
+consOL a bs = Cons a bs
concatOL aas = foldr appOL None aas
isNilOL None = True
isNilOL _ = False
-appOL None bs = bs
-appOL as None = as
-appOL as bs = Two as bs
+None `appOL` b = b
+a `appOL` None = a
+One a `appOL` b = Cons a b
+a `appOL` One b = Snoc a b
+a `appOL` b = Two a b
+
+fromOL :: OrdList a -> [a]
+fromOL a = go a []
+ where go None acc = acc
+ go (One a) acc = a : acc
+ go (Cons a b) acc = a : go b acc
+ go (Snoc a b) acc = go a (b:acc)
+ go (Two a b) acc = go a (go b acc)
+ go (Many xs) acc = xs ++ acc
mapOL :: (a -> b) -> OrdList a -> OrdList b
mapOL _ None = None
mapOL f (One x) = One (f x)
+mapOL f (Cons x xs) = Cons (f x) (mapOL f xs)
+mapOL f (Snoc xs x) = Snoc (mapOL f xs) (f x)
mapOL f (Two x y) = Two (mapOL f x) (mapOL f y)
mapOL f (Many xs) = Many (map f xs)
@@ -69,24 +83,19 @@ instance Functor OrdList where
foldrOL :: (a->b->b) -> b -> OrdList a -> b
foldrOL _ z None = z
foldrOL k z (One x) = k x z
+foldrOL k z (Cons x xs) = k x (foldrOL k z xs)
+foldrOL k z (Snoc xs x) = foldrOL k (k x z) xs
foldrOL k z (Two b1 b2) = foldrOL k (foldrOL k z b2) b1
foldrOL k z (Many xs) = foldr k z xs
foldlOL :: (b->a->b) -> b -> OrdList a -> b
foldlOL _ z None = z
foldlOL k z (One x) = k z x
+foldlOL k z (Cons x xs) = foldlOL k (k z x) xs
+foldlOL k z (Snoc xs x) = k (foldlOL k z xs) x
foldlOL k z (Two b1 b2) = foldlOL k (foldlOL k z b1) b2
foldlOL k z (Many xs) = foldl k z xs
-fromOL :: OrdList a -> [a]
-fromOL ol
- = flat ol []
- where
- flat None rest = rest
- flat (One x) rest = x:rest
- flat (Two a b) rest = flat a (flat b rest)
- flat (Many xs) rest = xs ++ rest
-
toOL :: [a] -> OrdList a
toOL [] = None
toOL xs = Many xs
diff --git a/compiler/utils/Stream.hs b/compiler/utils/Stream.hs
new file mode 100644
index 0000000000..2fa76d2345
--- /dev/null
+++ b/compiler/utils/Stream.hs
@@ -0,0 +1,97 @@
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 2012
+--
+-- Monadic streams
+--
+-- -----------------------------------------------------------------------------
+
+module Stream (
+ Stream(..), yield, liftIO,
+ collect, fromList,
+ Stream.map, Stream.mapM, Stream.mapAccumL
+ ) where
+
+-- |
+-- @Stream m a b@ is a computation in some Monad @m@ that delivers a sequence
+-- of elements of type @a@ followed by a result of type @b@.
+--
+-- More concretely, a value of type @Stream m a b@ can be run using @runStream@
+-- in the Monad @m@, and it delivers either
+--
+-- * the final result: @Left b@, or
+-- * @Right (a,str)@, where @a@ is the next element in the stream, and @str@
+-- is a computation to get the rest of the stream.
+--
+-- Stream is itself a Monad, and provides an operation 'yield' that
+-- produces a new element of the stream. This makes it convenient to turn
+-- existing monadic computations into streams.
+--
+-- The idea is that Stream is useful for making a monadic computation
+-- that produces values from time to time. This can be used for
+-- knitting together two complex monadic operations, so that the
+-- producer does not have to produce all its values before the
+-- consumer starts consuming them. We make the producer into a
+-- Stream, and the consumer pulls on the stream each time it wants a
+-- new value.
+--
+newtype Stream m a b = Stream { runStream :: m (Either b (a, Stream m a b)) }
+
+instance Monad m => Monad (Stream m a) where
+ return a = Stream (return (Left a))
+
+ Stream m >>= k = Stream $ do
+ r <- m
+ case r of
+ Left b -> runStream (k b)
+ Right (a,str) -> return (Right (a, str >>= k))
+
+yield :: Monad m => a -> Stream m a ()
+yield a = Stream (return (Right (a, return ())))
+
+liftIO :: IO a -> Stream IO b a
+liftIO io = Stream $ io >>= return . Left
+
+-- | Turn a Stream into an ordinary list, by demanding all the elements.
+collect :: Monad m => Stream m a () -> m [a]
+collect str = go str []
+ where
+ go str acc = do
+ r <- runStream str
+ case r of
+ Left () -> return (reverse acc)
+ Right (a, str') -> go str' (a:acc)
+
+-- | Turn a list into a 'Stream', by yielding each element in turn.
+fromList :: Monad m => [a] -> Stream m a ()
+fromList = mapM_ yield
+
+-- | Apply a function to each element of a 'Stream', lazily
+map :: Monad m => (a -> b) -> Stream m a x -> Stream m b x
+map f str = Stream $ do
+ r <- runStream str
+ case r of
+ Left x -> return (Left x)
+ Right (a, str') -> return (Right (f a, Stream.map f str'))
+
+-- | Apply a monadic operation to each element of a 'Stream', lazily
+mapM :: Monad m => (a -> m b) -> Stream m a x -> Stream m b x
+mapM f str = Stream $ do
+ r <- runStream str
+ case r of
+ Left x -> return (Left x)
+ Right (a, str') -> do
+ b <- f a
+ return (Right (b, Stream.mapM f str'))
+
+-- | analog of the list-based 'mapAccumL' on Streams. This is a simple
+-- way to map over a Stream while carrying some state around.
+mapAccumL :: Monad m => (c -> a -> m (c,b)) -> c -> Stream m a ()
+ -> Stream m b c
+mapAccumL f c str = Stream $ do
+ r <- runStream str
+ case r of
+ Left () -> return (Left c)
+ Right (a, str') -> do
+ (c',b) <- f c a
+ return (Right (b, mapAccumL f c' str'))
diff --git a/compiler/utils/Util.lhs b/compiler/utils/Util.lhs
index d87f526bc8..9d12946052 100644
--- a/compiler/utils/Util.lhs
+++ b/compiler/utils/Util.lhs
@@ -46,7 +46,7 @@ module Util (
nTimes,
-- * Sorting
- sortLe, sortWith, minWith, on,
+ sortWith, minWith,
-- * Comparisons
isEqual, eqListBy, eqMaybeBy,
@@ -92,7 +92,10 @@ module Util (
abstractConstr, abstractDataType, mkNoRepType,
-- * Utils for printing C code
- charToC
+ charToC,
+
+ -- * Hashing
+ hashString,
) where
#include "HsVersions.h"
@@ -115,6 +118,7 @@ import System.Directory ( doesDirectoryExist, getModificationTime )
import System.FilePath
import Data.Char ( isUpper, isAlphaNum, isSpace, chr, ord, isDigit )
+import Data.Int
import Data.Ratio ( (%) )
import Data.Ord ( comparing )
import Data.Bits
@@ -310,12 +314,7 @@ zipWith4Equal msg _ _ _ _ _ = panic ("zipWith4Equal: unequal lists:"++msg)
-- | 'zipLazy' is a kind of 'zip' that is lazy in the second list (observe the ~)
zipLazy :: [a] -> [b] -> [(a,b)]
zipLazy [] _ = []
--- We want to write this, but with GHC 6.4 we get a warning, so it
--- doesn't validate:
--- zipLazy (x:xs) ~(y:ys) = (x,y) : zipLazy xs ys
--- so we write this instead:
-zipLazy (x:xs) zs = let y : ys = zs
- in (x,y) : zipLazy xs ys
+zipLazy (x:xs) ~(y:ys) = (x,y) : zipLazy xs ys
\end{code}
@@ -477,114 +476,17 @@ isn'tIn msg x ys
%************************************************************************
%* *
-\subsubsection[Utils-Carsten-mergesort]{A mergesort from Carsten}
+\subsubsection{Sort utils}
%* *
%************************************************************************
-\begin{display}
-Date: Mon, 3 May 93 20:45:23 +0200
-From: Carsten Kehler Holst <kehler@cs.chalmers.se>
-To: partain@dcs.gla.ac.uk
-Subject: natural merge sort beats quick sort [ and it is prettier ]
-
-Here is a piece of Haskell code that I'm rather fond of. See it as an
-attempt to get rid of the ridiculous quick-sort routine. groupUpdown is
-quite useful by itself I think it was John's idea originally though I
-believe the lazy version is due to me [surprisingly complicated].
-gamma [used to be called] is called gamma because I got inspired by
-the Gamma calculus. It is not very close to the calculus but does
-behave less sequentially than both foldr and foldl. One could imagine
-a version of gamma that took a unit element as well thereby avoiding
-the problem with empty lists.
-
-I've tried this code against
-
- 1) insertion sort - as provided by haskell
- 2) the normal implementation of quick sort
- 3) a deforested version of quick sort due to Jan Sparud
- 4) a super-optimized-quick-sort of Lennart's
-
-If the list is partially sorted both merge sort and in particular
-natural merge sort wins. If the list is random [ average length of
-rising subsequences = approx 2 ] mergesort still wins and natural
-merge sort is marginally beaten by Lennart's soqs. The space
-consumption of merge sort is a bit worse than Lennart's quick sort
-approx a factor of 2. And a lot worse if Sparud's bug-fix [see his
-fpca article ] isn't used because of groupUpdown.
-
-have fun
-Carsten
-\end{display}
-
\begin{code}
-groupUpdown :: (a -> a -> Bool) -> [a] -> [[a]]
--- Given a <= function, groupUpdown finds maximal contiguous up-runs
--- or down-runs in the input list.
--- It's stable, in the sense that it never re-orders equal elements
---
--- Date: Mon, 12 Feb 1996 15:09:41 +0000
--- From: Andy Gill <andy@dcs.gla.ac.uk>
--- Here is a `better' definition of groupUpdown.
-
-groupUpdown _ [] = []
-groupUpdown p (x:xs) = group' xs x x (x :)
- where
- group' [] _ _ s = [s []]
- group' (x:xs) x_min x_max s
- | x_max `p` x = group' xs x_min x (s . (x :))
- | not (x_min `p` x) = group' xs x x_max ((x :) . s)
- | otherwise = s [] : group' xs x x (x :)
- -- NB: the 'not' is essential for stablity
- -- x `p` x_min would reverse equal elements
-
-generalMerge :: (a -> a -> Bool) -> [a] -> [a] -> [a]
-generalMerge _ xs [] = xs
-generalMerge _ [] ys = ys
-generalMerge p (x:xs) (y:ys) | x `p` y = x : generalMerge p xs (y:ys)
- | otherwise = y : generalMerge p (x:xs) ys
-
--- gamma is now called balancedFold
-
-balancedFold :: (a -> a -> a) -> [a] -> a
-balancedFold _ [] = error "can't reduce an empty list using balancedFold"
-balancedFold _ [x] = x
-balancedFold f l = balancedFold f (balancedFold' f l)
-
-balancedFold' :: (a -> a -> a) -> [a] -> [a]
-balancedFold' f (x:y:xs) = f x y : balancedFold' f xs
-balancedFold' _ xs = xs
-
-generalNaturalMergeSort :: (a -> a -> Bool) -> [a] -> [a]
-generalNaturalMergeSort _ [] = []
-generalNaturalMergeSort p xs = (balancedFold (generalMerge p) . groupUpdown p) xs
-
-#if NOT_USED
-generalMergeSort p [] = []
-generalMergeSort p xs = (balancedFold (generalMerge p) . map (: [])) xs
-
-mergeSort, naturalMergeSort :: Ord a => [a] -> [a]
-
-mergeSort = generalMergeSort (<=)
-naturalMergeSort = generalNaturalMergeSort (<=)
-
-mergeSortLe le = generalMergeSort le
-#endif
-
-sortLe :: (a->a->Bool) -> [a] -> [a]
-sortLe le = generalNaturalMergeSort le
-
sortWith :: Ord b => (a->b) -> [a] -> [a]
-sortWith get_key xs = sortLe le xs
- where
- x `le` y = get_key x < get_key y
+sortWith get_key xs = sortBy (comparing get_key) xs
minWith :: Ord b => (a -> b) -> [a] -> a
minWith get_key xs = ASSERT( not (null xs) )
head (sortWith get_key xs)
-
-on :: (a -> a -> c) -> (b -> a) -> b -> b -> c
-on cmp sel = \x y -> sel x `cmp` sel y
-
\end{code}
%************************************************************************
@@ -1151,3 +1053,70 @@ charToC w =
chr (ord '0' + ord c `mod` 8)]
\end{code}
+%************************************************************************
+%* *
+\subsection[Utils-Hashing]{Utils for hashing}
+%* *
+%************************************************************************
+
+\begin{code}
+-- | A sample hash function for Strings. We keep multiplying by the
+-- golden ratio and adding. The implementation is:
+--
+-- > hashString = foldl' f golden
+-- > where f m c = fromIntegral (ord c) * magic + hashInt32 m
+-- > magic = 0xdeadbeef
+--
+-- Where hashInt32 works just as hashInt shown above.
+--
+-- Knuth argues that repeated multiplication by the golden ratio
+-- will minimize gaps in the hash space, and thus it's a good choice
+-- for combining together multiple keys to form one.
+--
+-- Here we know that individual characters c are often small, and this
+-- produces frequent collisions if we use ord c alone. A
+-- particular problem is the shorter low ASCII and ISO-8859-1
+-- character strings. We pre-multiply by a magic twiddle factor to
+-- obtain a good distribution. In fact, given the following test:
+--
+-- > testp :: Int32 -> Int
+-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
+-- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
+-- > hs = foldl' f golden
+-- > f m c = fromIntegral (ord c) * k + hashInt32 m
+-- > n = 100000
+--
+-- We discover that testp magic = 0.
+hashString :: String -> Int32
+hashString = foldl' f golden
+ where f m c = fromIntegral (ord c) * magic + hashInt32 m
+ magic = 0xdeadbeef
+
+golden :: Int32
+golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
+-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
+-- but that has bad mulHi properties (even adding 2^32 to get its inverse)
+-- Whereas the above works well and contains no hash duplications for
+-- [-32767..65536]
+
+-- | A sample (and useful) hash function for Int32,
+-- implemented by extracting the uppermost 32 bits of the 64-bit
+-- result of multiplying by a 33-bit constant. The constant is from
+-- Knuth, derived from the golden ratio:
+--
+-- > golden = round ((sqrt 5 - 1) * 2^32)
+--
+-- We get good key uniqueness on small inputs
+-- (a problem with previous versions):
+-- (length $ group $ sort $ map hashInt32 [-32767..65536]) == 65536 + 32768
+--
+hashInt32 :: Int32 -> Int32
+hashInt32 x = mulHi x golden + x
+
+-- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
+mulHi :: Int32 -> Int32 -> Int32
+mulHi a b = fromIntegral (r `shiftR` 32)
+ where r :: Int64
+ r = fromIntegral a * fromIntegral b
+\end{code}
+
diff --git a/compiler/vectorise/Vectorise/Generic/PADict.hs b/compiler/vectorise/Vectorise/Generic/PADict.hs
index d73bea17ee..20aab59182 100644
--- a/compiler/vectorise/Vectorise/Generic/PADict.hs
+++ b/compiler/vectorise/Vectorise/Generic/PADict.hs
@@ -19,6 +19,7 @@ import Type
import Id
import Var
import Name
+import FastString
-- |Build the PA dictionary function for some type and hoist it to top level.
@@ -26,15 +27,15 @@ import Name
-- The PA dictionary holds fns that convert values to and from their vectorised representations.
--
-- @Recall the definition:
--- class class PR (PRepr a) => PA a where
+-- class PR (PRepr a) => PA a where
-- toPRepr :: a -> PRepr a
-- fromPRepr :: PRepr a -> a
-- toArrPRepr :: PData a -> PData (PRepr a)
-- fromArrPRepr :: PData (PRepr a) -> PData a
--
-- Example:
--- df :: forall a. PA a -> PA (T a)
--- df = /\a. \(d:PA a). MkPA ($PR_df a d) ($toPRepr a d) ...
+-- df :: forall a. PR (PRepr a) -> PA a -> PA (T a)
+-- df = /\a. \(c:PR (PRepr a)) (d:PA a). MkPA c ($PR_df a d) ($toPRepr a d) ...
-- $dPR_df :: forall a. PA a -> PR (PRepr (T a))
-- $dPR_df = ....
-- $toRepr :: forall a. PA a -> T a -> PRepr (T a)
@@ -52,34 +53,48 @@ buildPADict
-> VM Var -- ^ name of the top-level dictionary function.
buildPADict vect_tc prepr_ax pdata_tc pdatas_tc repr
- = polyAbstract tvs $ \args -> -- The args are the dictionaries we lambda
- -- abstract over; and they are put in the
- -- envt, so when we need a (PA a) we can
- -- find it in the envt
+ = polyAbstract tvs $ \args -> -- The args are the dictionaries we lambda abstract over; and they
+ -- are put in the envt, so when we need a (PA a) we can find it in
+ -- the envt; they don't include the silent superclass args yet
do { mod <- liftDs getModuleDs
; let dfun_name = mkLocalisedOccName mod mkPADFunOcc vect_tc_name
-
+
+ -- The superclass dictionary is a (silent) argument if the tycon is polymorphic...
+ ; let mk_super_ty = do { r <- mkPReprType inst_ty
+ ; pr_cls <- builtin prClass
+ ; return $ mkClassPred pr_cls [r]
+ }
+ ; super_tys <- sequence [mk_super_ty | not (null tvs)]
+ ; super_args <- mapM (newLocalVar (fsLit "pr")) super_tys
+ ; let all_args = super_args ++ args
+
+ -- ...it is constant otherwise
+ ; super_consts <- sequence [prDictOfPReprInstTyCon inst_ty prepr_ax [] | null tvs]
+
-- Get ids for each of the methods in the dictionary, including superclass
; paMethodBuilders <- buildPAScAndMethods
- ; method_ids <- mapM (method args dfun_name) paMethodBuilders
+ ; method_ids <- mapM (method all_args dfun_name) paMethodBuilders
-- Expression to build the dictionary.
; pa_dc <- builtin paDataCon
- ; let dict = mkLams (tvs ++ args)
+ ; let dict = mkLams (tvs ++ all_args)
$ mkConApp pa_dc
$ Type inst_ty
- : map (method_call args) method_ids
+ : map Var super_args ++ super_consts -- the superclass dictionary is either lambda-bound or constant
+ ++ map (method_call all_args) method_ids
-- Build the type of the dictionary function.
; pa_cls <- builtin paClass
; let dfun_ty = mkForAllTys tvs
- $ mkFunTys (map varType args)
+ $ mkFunTys (map varType all_args)
(mkClassPred pa_cls [inst_ty])
-- Set the unfolding for the inliner.
; raw_dfun <- newExportedVar dfun_name dfun_ty
; let dfun_unf = mkDFunUnfolding dfun_ty $
- map Var method_ids
+ map (const $ DFunLamArg 0) super_args
+ -- ++ map DFunConstArg super_consts
+ ++ map (DFunPolyArg . Var) method_ids
dfun = raw_dfun `setIdUnfolding` dfun_unf
`setInlinePragma` dfunInlinePragma
diff --git a/compiler/vectorise/Vectorise/Generic/PAMethods.hs b/compiler/vectorise/Vectorise/Generic/PAMethods.hs
index 77793295dd..61c07cd299 100644
--- a/compiler/vectorise/Vectorise/Generic/PAMethods.hs
+++ b/compiler/vectorise/Vectorise/Generic/PAMethods.hs
@@ -68,8 +68,7 @@ type PAInstanceBuilder
buildPAScAndMethods :: VM [(String, PAInstanceBuilder)]
buildPAScAndMethods
- = return [ ("PR", buildPRDict)
- , ("toPRepr", buildToPRepr)
+ = return [ ("toPRepr", buildToPRepr)
, ("fromPRepr", buildFromPRepr)
, ("toArrPRepr", buildToArrPRepr)
, ("fromArrPRepr", buildFromArrPRepr)
@@ -77,14 +76,6 @@ buildPAScAndMethods
, ("fromArrPReprs", buildFromArrPReprs)]
-buildPRDict :: PAInstanceBuilder
-buildPRDict vect_tc prepr_ax _ _ _
- = prDictOfPReprInstTyCon inst_ty prepr_ax arg_tys
- where
- arg_tys = mkTyVarTys (tyConTyVars vect_tc)
- inst_ty = mkTyConApp vect_tc arg_tys
-
-
-- buildToPRepr ---------------------------------------------------------------
-- | Build the 'toRepr' method of the PA class.
buildToPRepr :: PAInstanceBuilder
diff --git a/compiler/vectorise/Vectorise/Monad/InstEnv.hs b/compiler/vectorise/Vectorise/Monad/InstEnv.hs
index 34d3d75b75..fc12ee567c 100644
--- a/compiler/vectorise/Vectorise/Monad/InstEnv.hs
+++ b/compiler/vectorise/Vectorise/Monad/InstEnv.hs
@@ -40,13 +40,13 @@ lookupInst cls tys
cantVectorise dflags "Vectorise.Monad.InstEnv.lookupInst:" err
}
--- Look up the representation tycon of a family instance.
+-- Look up a family instance.
--
-- The match must be unique - ie, match exactly one instance - but the
-- type arguments used for matching may be more specific than those of
-- the family instance declaration.
--
--- Return the instance tycon and its type instance. For example, if we have
+-- Return the family instance and its type instance. For example, if we have
--
-- lookupFamInst 'T' '[Int]' yields (':R42T', 'Int')
--
@@ -56,13 +56,12 @@ lookupInst cls tys
--
-- which implies that :R42T was declared as 'data instance T [a]'.
--
-lookupFamInst :: TyCon -> [Type] -> VM (TyCon, [Type])
+lookupFamInst :: TyCon -> [Type] -> VM (FamInst, [Type])
lookupFamInst tycon tys
= ASSERT( isFamilyTyCon tycon )
do { instEnv <- readGEnv global_fam_inst_env
; case lookupFamInstEnv instEnv tycon tys of
- [(fam_inst, rep_tys)] -> return ( dataFamInstRepTyCon fam_inst
- , rep_tys)
+ [(fam_inst, rep_tys)] -> return ( fam_inst, rep_tys)
_other ->
do dflags <- getDynFlags
cantVectorise dflags "VectMonad.lookupFamInst: not found: "
diff --git a/compiler/vectorise/Vectorise/Utils/Base.hs b/compiler/vectorise/Vectorise/Utils/Base.hs
index 2b47ddfb9b..9ed4e2c60e 100644
--- a/compiler/vectorise/Vectorise/Utils/Base.hs
+++ b/compiler/vectorise/Vectorise/Utils/Base.hs
@@ -21,6 +21,8 @@ module Vectorise.Utils.Base
, pdataReprTyConExact
, pdatasReprTyConExact
, pdataUnwrapScrut
+
+ , preprSynTyCon
) where
import Vectorise.Monad
@@ -29,6 +31,7 @@ import Vectorise.Builtins
import CoreSyn
import CoreUtils
+import FamInstEnv
import Coercion
import Type
import TyCon
@@ -200,7 +203,11 @@ unwrapNewTypeBodyOfPDatasWrap e ty
-- a set of distinct type variables.
--
pdataReprTyCon :: Type -> VM (TyCon, [Type])
-pdataReprTyCon ty = builtin pdataTyCon >>= (`lookupFamInst` [ty])
+pdataReprTyCon ty
+ = do
+ { (famInst, tys) <- builtin pdataTyCon >>= (`lookupFamInst` [ty])
+ ; return (dataFamInstRepTyCon famInst, tys)
+ }
-- |Get the representation tycon of the 'PData' data family for a given type constructor.
--
@@ -225,7 +232,7 @@ pdatasReprTyConExact tycon
= do { -- look up the representation tycon; if there is a match at all, it will be be exact
; -- (i.e.,' _tys' will be distinct type variables)
; (ptycon, _tys) <- pdatasReprTyCon (tycon `mkTyConApp` mkTyVarTys (tyConTyVars tycon))
- ; return ptycon
+ ; return $ dataFamInstRepTyCon ptycon
}
where
pdatasReprTyCon ty = builtin pdatasTyCon >>= (`lookupFamInst` [ty])
@@ -240,3 +247,11 @@ pdataUnwrapScrut (ve, le)
}
where
ty = exprType ve
+
+
+-- 'PRepr' representation types ----------------------------------------------
+
+-- |Get the representation tycon of the 'PRepr' type family for a given type.
+--
+preprSynTyCon :: Type -> VM (FamInst, [Type])
+preprSynTyCon ty = builtin preprTyCon >>= (`lookupFamInst` [ty])
diff --git a/compiler/vectorise/Vectorise/Utils/PADict.hs b/compiler/vectorise/Vectorise/Utils/PADict.hs
index de80127c44..85060c477c 100644
--- a/compiler/vectorise/Vectorise/Utils/PADict.hs
+++ b/compiler/vectorise/Vectorise/Utils/PADict.hs
@@ -12,6 +12,7 @@ import Vectorise.Utils.Base
import CoreSyn
import CoreUtils
+import FamInstEnv
import Coercion
import Type
import TypeRep
@@ -66,25 +67,35 @@ paDictOfType ty
-- for type variables, look up the dfun and apply to the PA dictionaries
-- of the type arguments
paDictOfTyApp (TyVarTy tv) ty_args
- = do dfun <- maybeCantVectoriseM "No PA dictionary for type variable"
+ = do
+ { dfun <- maybeCantVectoriseM "No PA dictionary for type variable"
(ppr tv <+> text "in" <+> ppr ty)
$ lookupTyVarPA tv
- dicts <- mapM paDictOfType ty_args
- return $ dfun `mkTyApps` ty_args `mkApps` dicts
+ ; dicts <- mapM paDictOfType ty_args
+ ; return $ dfun `mkTyApps` ty_args `mkApps` dicts
+ }
-- for tycons, we also need to apply the dfun to the PR dictionary of
-- the representation type if the tycon is polymorphic
paDictOfTyApp (TyConApp tc []) ty_args
- = do
- dfun <- maybeCantVectoriseM noPADictErr (ppr tc <+> text "in" <+> ppr ty)
+ = do
+ { dfun <- maybeCantVectoriseM noPADictErr (ppr tc <+> text "in" <+> ppr ty)
$ lookupTyConPA tc
- dicts <- mapM paDictOfType ty_args
- return $ Var dfun `mkTyApps` ty_args `mkApps` dicts
- where
- noPADictErr = "No PA dictionary for type constructor (did you import 'Data.Array.Parallel'?)"
-
- paDictOfTyApp _ _ = do dflags <- getDynFlags
- failure dflags
+ ; super <- super_dict tc ty_args
+ ; dicts <- mapM paDictOfType ty_args
+ ; return $ Var dfun `mkTyApps` ty_args `mkApps` super `mkApps` dicts
+ }
+ where
+ noPADictErr = "No PA dictionary for type constructor (did you import 'Data.Array.Parallel'?)"
+
+ super_dict _ [] = return []
+ super_dict tycon ty_args
+ = do
+ { pr <- prDictOfPReprInst (TyConApp tycon ty_args)
+ ; return [pr]
+ }
+
+ paDictOfTyApp _ _ = getDynFlags >>= failure
failure dflags = cantVectorise dflags "Can't construct PA dictionary for type" (ppr ty)
@@ -96,11 +107,21 @@ paMethod _ query ty
= liftM Var $ builtin (query tycon)
paMethod method _ ty
= do
- fn <- builtin method
- dict <- paDictOfType ty
- return $ mkApps (Var fn) [Type ty, dict]
+ { fn <- builtin method
+ ; dict <- paDictOfType ty
+ ; return $ mkApps (Var fn) [Type ty, dict]
+ }
+
+-- |Given a type @ty@, return the PR dictionary for @PRepr ty@.
+--
+prDictOfPReprInst :: Type -> VM CoreExpr
+prDictOfPReprInst ty
+ = do
+ { (prepr_fam, prepr_args) <- preprSynTyCon ty
+ ; prDictOfPReprInstTyCon ty (famInstAxiom prepr_fam) prepr_args
+ }
--- | Given a type @ty@, its PRepr synonym tycon and its type arguments,
+-- |Given a type @ty@, its PRepr synonym tycon and its type arguments,
-- return the PR @PRepr ty@. Suppose we have:
--
-- > type instance PRepr (T a1 ... an) = t
diff --git a/configure.ac b/configure.ac
index e1b4db0d73..74191e4946 100644
--- a/configure.ac
+++ b/configure.ac
@@ -336,6 +336,7 @@ then
CC="${mingwbin}gcc.exe"
LD="${mingwbin}ld.exe"
NM="${mingwbin}nm.exe"
+ OBJDUMP="${mingwbin}objdump.exe"
fp_prog_ar="${mingwbin}ar.exe"
if ! test -d inplace/perl ||
@@ -446,6 +447,17 @@ FP_ARG_WITH_PATH_GNU_PROG([NM], [nm], [nm])
NmCmd="$NM"
AC_SUBST([NmCmd])
+# Note: we may not have objdump on OS X, and we only need it on Windows (for DLL checks)
+case $HostOS_CPP in
+cygwin32|mingw32)
+ dnl ** Which objdump to use?
+ dnl --------------------------------------------------------------
+ FP_ARG_WITH_PATH_GNU_PROG([OBJDUMP], [objdump], [objdump])
+ ObjdumpCmd="$OBJDUMP"
+ AC_SUBST([ObjdumpCmd])
+ ;;
+esac
+
dnl ** Which LLVM llc to use?
dnl --------------------------------------------------------------
FP_ARG_WITH_PATH_GNU_PROG_OPTIONAL([LLC], [llc])
diff --git a/docs/users_guide/flags.xml b/docs/users_guide/flags.xml
index 3b4f36db6c..11e4f8f7d3 100644
--- a/docs/users_guide/flags.xml
+++ b/docs/users_guide/flags.xml
@@ -535,6 +535,13 @@
<entry>dynamic</entry>
<entry>-</entry>
</row>
+ <row>
+ <entry><option>-interactive-print</option></entry>
+ <entry><link linkend="ghci-interactive-print">Select the function
+ to use for printing evaluated expressions in GHCi</link></entry>
+ <entry>dynamic</entry>
+ <entry>-</entry>
+ </row>
</tbody>
</tgroup>
diff --git a/docs/users_guide/ghci.xml b/docs/users_guide/ghci.xml
index 3d629db9a6..5726a41baf 100644
--- a/docs/users_guide/ghci.xml
+++ b/docs/users_guide/ghci.xml
@@ -1090,6 +1090,61 @@ def = toEnum 0
printf.
</para>
</sect2>
+ <sect2 id="ghci-interactive-print">
+ <title>Using a custom interactive printing function</title>
+ <para>[<emphasis role="bold">New in version 7.6.1</emphasis>]
+ By default, GHCi prints the result of expressions typed at the prompt
+ using the function <literal>System.IO.print</literal>. Its type
+ signature is <literal>Show a => a -> IO ()</literal>, and it works by
+ converting the value to <literal>String</literal> using
+ <literal>show</literal>.
+ </para>
+ <para>
+ This is not ideal in certain cases, like when the output is long, or
+ contains strings with non-ASCII characters.
+ </para>
+ <para>
+ The <literal>-interactive-print</literal> flag allows one to specify any
+ function of type <literal>C a => a -> IO ()</literal>, for some
+ constraint <literal>C</literal>, as the function for printing evaluated
+ expressions. The function can reside in any loaded module or any
+ registered package.
+ </para>
+ <para>
+ As an example, suppose we have following special printing module:
+ <programlisting>
+ module SpecPrinter where
+ import System.IO
+
+ sprint a = putStrLn $ show a ++ "!"
+ </programlisting>
+ The <literal>sprint</literal> function adds an exclamation mark at the
+ end of any printed value. Running GHCi with the command:
+ <programlisting>
+ ghci -interactive-print=SpecPrinter.sprint SpecPrinter
+ </programlisting>
+ will start an interactive session where values will be printed using
+ <literal>sprint</literal>:
+ <programlisting>
+ *SpecPrinter> [1,2,3]
+ [1,2,3]!
+ *SpecPrinter> 42
+ 42!
+ </programlisting>
+ </para>
+ <para>
+ A custom pretty printing function can be used, for example, to format
+ tree-like and nested structures in a more readable way.
+ </para>
+ <para>
+ The <literal>-interactive-print</literal> flag can also be used when
+ running GHC in <literal>-e mode</literal>:
+ <programlisting>
+ % ghc -e "[1,2,3]" -interactive-print=SpecPrinter.sprint SpecPrinter
+ [1,2,3]!
+ </programlisting>
+ </para>
+ </sect2>
</sect1>
<sect1 id="ghci-debugger">
diff --git a/docs/users_guide/glasgow_exts.xml b/docs/users_guide/glasgow_exts.xml
index c941df1b6e..df1ff2c181 100644
--- a/docs/users_guide/glasgow_exts.xml
+++ b/docs/users_guide/glasgow_exts.xml
@@ -3339,8 +3339,9 @@ then writing the data type instance by hand.
</listitem>
<listitem><para> With <option>-XDeriveGeneric</option>, you can derive
-instances of the class <literal>Generic</literal>, defined in
-<literal>GHC.Generics</literal>. You can use these to define generic functions,
+instances of the classes <literal>Generic</literal> and
+<literal>Generic1</literal>, defined in <literal>GHC.Generics</literal>.
+You can use these to define generic functions,
as described in <xref linkend="generic-programming"/>.
</para></listitem>
@@ -9884,8 +9885,9 @@ data (:*:) f g p = f p :*: g p
</para>
<para>
-The <literal>Generic</literal> class mediates between user-defined datatypes
-and their internal representation as a sum-of-products:
+The <literal>Generic</literal> and <literal>Generic1</literal> classes mediate
+between user-defined datatypes and their internal representation as a
+sum-of-products:
<programlisting>
class Generic a where
@@ -9895,9 +9897,17 @@ class Generic a where
from :: a -> (Rep a) x
-- Convert from the representation to the datatype
to :: (Rep a) x -> a
+
+class Generic1 f where
+ type Rep1 f :: * -> *
+
+ from1 :: f a -> Rep1 f a
+ to1 :: Rep1 f a -> f a
</programlisting>
-Instances of this class can be derived by GHC with the
+<literal>Generic1</literal> is used for functions that can only be defined over
+type containers, such as <literal>map</literal>.
+Instances of these classes can be derived by GHC with the
<option>-XDeriveGeneric</option> (<xref linkend="deriving-typeable"/>), and are
necessary to be able to define generic instances automatically.
</para>
@@ -9912,7 +9922,7 @@ instance Generic (UserTree a) where
type Rep (UserTree a) =
M1 D D1UserTree (
M1 C C1_0UserTree (
- M1 S NoSelector (K1 P a)
+ M1 S NoSelector (K1 R a)
:*: M1 S NoSelector (K1 R (UserTree a))
:*: M1 S NoSelector (K1 R (UserTree a)))
:+: M1 C C1_1UserTree U1)
@@ -10007,17 +10017,20 @@ instance (Serialize a) => Serialize (UserTree a)
The default method for <literal>put</literal> is then used, corresponding to the
generic implementation of serialization.
+
+For more examples of generic functions please refer to the
+<ulink url="http://hackage.haskell.org/package/generic-deriving">generic-deriving</ulink>
+package on Hackage.
</para>
</sect2>
-
<sect2>
<title>More information</title>
<para>
-For more detail please refer to the
-<ulink url="http://www.haskell.org/haskellwiki/Generics">HaskellWiki page</ulink>
-or the original paper:
+For more details please refer to the
+<ulink url="http://www.haskell.org/haskellwiki/GHC.Generics">HaskellWiki
+page</ulink> or the original paper:
</para>
<itemizedlist>
@@ -10032,12 +10045,6 @@ Jose Pedro Magalhaes, Atze Dijkstra, Johan Jeuring, and Andres Loeh.
</listitem>
</itemizedlist>
-<emphasis>Note</emphasis>: the current support for generic programming in GHC
-is preliminary. In particular, we only allow deriving instances for the
-<literal>Generic</literal> class. Support for deriving
-<literal>Generic1</literal> (and thus enabling generic functions of kind
-<literal>* -> *</literal> such as <literal>fmap</literal>) will come at a
-later stage.
</sect2>
</sect1>
diff --git a/ghc.mk b/ghc.mk
index 903cf1f52b..fc7d44b59e 100644
--- a/ghc.mk
+++ b/ghc.mk
@@ -528,6 +528,18 @@ $(error Unknown integer library: $(INTEGER_LIBRARY))
endif
endif
+# ----------------------------------------
+# Workarounds for problems building DLLs on Windows
+
+ifeq "$(TargetOS_CPP)" "mingw32"
+# We don't build the GHC package the dyn way on Windows, so
+# we can't build these packages the dyn way either. See trac #5987
+libraries/dph/dph-lifted-base_dist-install_EXCLUDED_WAYS := dyn
+libraries/dph/dph-lifted-boxed_dist-install_EXCLUDED_WAYS := dyn
+libraries/dph/dph-lifted-copy_dist-install_EXCLUDED_WAYS := dyn
+libraries/dph/dph-lifted-vseg_dist-install_EXCLUDED_WAYS := dyn
+endif
+
# ----------------------------------------------
# Checking packages with 'cabal check'
diff --git a/ghc/GhciTags.hs b/ghc/GhciTags.hs
index e838269df8..1f43328f8d 100644
--- a/ghc/GhciTags.hs
+++ b/ghc/GhciTags.hs
@@ -17,7 +17,6 @@ import Exception
import GHC
import GhciMonad
import Outputable
-import Util
-- ToDo: figure out whether we need these, and put something appropriate
-- into the GHC API instead
@@ -25,7 +24,9 @@ import Name (nameOccName)
import OccName (pprOccName)
import MonadUtils
+import Data.Function
import Data.Maybe
+import Data.Ord
import Panic
import Data.List
import Control.Monad
@@ -132,13 +133,13 @@ tagInfo dflags unqual exported kind name loc
collateAndWriteTags :: TagsKind -> FilePath -> [TagInfo] -> IO (Either IOError ())
-- ctags style with the Ex exresion being just the line number, Vim et al
collateAndWriteTags CTagsWithLineNumbers file tagInfos = do
- let tags = unlines $ sortLe (<=) $ map showCTag tagInfos
+ let tags = unlines $ sort $ map showCTag tagInfos
tryIO (writeFile file tags)
-- ctags style with the Ex exresion being a regex searching the line, Vim et al
collateAndWriteTags CTagsWithRegExes file tagInfos = do -- ctags style, Vim et al
tagInfoGroups <- makeTagGroupsWithSrcInfo tagInfos
- let tags = unlines $ sortLe (<=) $ map showCTag $concat tagInfoGroups
+ let tags = unlines $ sort $ map showCTag $concat tagInfoGroups
tryIO (writeFile file tags)
collateAndWriteTags ETags file tagInfos = do -- etags style, Emacs/XEmacs
@@ -155,16 +156,14 @@ collateAndWriteTags ETags file tagInfos = do -- etags style, Emacs/XEmacs
makeTagGroupsWithSrcInfo :: [TagInfo] -> IO [[TagInfo]]
makeTagGroupsWithSrcInfo tagInfos = do
- let byFile op ti0 ti1 = tagFile ti0 `op` tagFile ti1
- groups = groupBy (byFile (==)) $ sortLe (byFile (<=)) tagInfos
+ let groups = groupBy ((==) `on` tagFile) $ sortBy (comparing tagFile) tagInfos
mapM addTagSrcInfo groups
where
addTagSrcInfo [] = ghcError (CmdLineError "empty tag file group??")
addTagSrcInfo group@(tagInfo:_) = do
file <- readFile $tagFile tagInfo
- let byLine ti0 ti1 = tagLine ti0 <= tagLine ti1
- sortedGroup = sortLe byLine group
+ let sortedGroup = sortBy (comparing tagLine) group
return $ perFile sortedGroup 1 0 $ lines file
perFile allTags@(tag:tags) cnt pos allLs@(l:ls)
diff --git a/ghc/InteractiveUI.hs b/ghc/InteractiveUI.hs
index c56f5067f9..1dc203d4ad 100644
--- a/ghc/InteractiveUI.hs
+++ b/ghc/InteractiveUI.hs
@@ -21,12 +21,14 @@ import Debugger
-- The GHC interface
import DynFlags
+import GhcMonad ( modifySession )
import qualified GHC
import GHC ( LoadHowMuch(..), Target(..), TargetId(..), InteractiveImport(..),
TyThing(..), Phase, BreakIndex, Resume, SingleStep, Ghc,
handleSourceError )
import HsImpExp
-import HscTypes ( tyThingParent_maybe, handleFlagWarnings, getSafeMode, dep_pkgs )
+import HscTypes ( tyThingParent_maybe, handleFlagWarnings, getSafeMode, dep_pkgs, hsc_IC,
+ setInteractivePrintName )
import Module
import Name
import Packages ( trusted, getPackageDetails, exposed, exposedModules, pkgIdMap )
@@ -62,6 +64,7 @@ import Control.Monad.IO.Class
import Data.Array
import qualified Data.ByteString.Char8 as BS
import Data.Char
+import Data.Function
import Data.IORef ( IORef, readIORef, writeIORef )
import Data.List ( find, group, intercalate, intersperse, isPrefixOf, nub,
partition, sort, sortBy )
@@ -206,7 +209,8 @@ helpText =
" :cmd <expr> run the commands returned by <expr>::IO String\n" ++
" :ctags[!] [<file>] create tags file for Vi (default: \"tags\")\n" ++
" (!: use regex instead of line number)\n" ++
- " :def <cmd> <expr> define a command :<cmd>\n" ++
+ " :def <cmd> <expr> define command :<cmd> (later defined command has\n" ++
+ " precedence, ::<cmd> is always a builtin command)\n" ++
" :edit <file> edit file\n" ++
" :edit edit last module\n" ++
" :etags [<file>] create tags file for Emacs (default: \"TAGS\")\n" ++
@@ -449,6 +453,8 @@ runGHCi paths maybe_exprs = do
when (isJust maybe_exprs && failed ok) $
liftIO (exitWith (ExitFailure 1))
+ installInteractivePrint (interactivePrint dflags) (isJust maybe_exprs)
+
-- if verbosity is greater than 0, or we are connected to a
-- terminal, display the prompt in the interactive loop.
is_tty <- liftIO (hIsTerminalDevice stdin)
@@ -606,6 +612,18 @@ queryQueue = do
c:cs -> do setGHCiState st{ cmdqueue = cs }
return (Just c)
+-- Reconfigurable pretty-printing Ticket #5461
+installInteractivePrint :: Maybe String -> Bool -> GHCi ()
+installInteractivePrint Nothing _ = return ()
+installInteractivePrint (Just ipFun) exprmode = do
+ ok <- trySuccess $ do
+ (name:_) <- GHC.parseName ipFun
+ modifySession (\he -> let new_ic = setInteractivePrintName (hsc_IC he) name
+ in he{hsc_IC = new_ic})
+ return Succeeded
+
+ when (failed ok && exprmode) $ liftIO (exitWith (ExitFailure 1))
+
-- | The main read-eval-print loop
runCommands :: InputT GHCi (Maybe String) -> InputT GHCi ()
runCommands = runCommands' handler
@@ -891,12 +909,9 @@ lookupCommand' ":" = return Nothing
lookupCommand' str' = do
macros <- readIORef macros_ref
let{ (str, cmds) = case str' of
- ':' : rest -> (rest, builtin_commands)
- _ -> (str', builtin_commands ++ macros) }
+ ':' : rest -> (rest, builtin_commands) -- "::" selects a builtin command
+ _ -> (str', macros ++ builtin_commands) } -- otherwise prefer macros
-- look for exact match first, then the first prefix match
- -- We consider builtin commands first: since new macros are appended
- -- on the *end* of the macros list, this is consistent with the view
- -- that things defined earlier should take precedence. See also #3858
return $ case [ c | c <- cmds, str == cmdName c ] of
c:_ -> Just c
[] -> case [ c | c@(s,_,_) <- cmds, str `isPrefixOf` s ] of
@@ -989,12 +1004,12 @@ filterOutChildren get_thing xs
pprInfo :: PrintExplicitForalls -> (TyThing, Fixity, [GHC.ClsInst]) -> SDoc
pprInfo pefas (thing, fixity, insts)
= pprTyThingInContextLoc pefas thing
- $$ show_fixity fixity
+ $$ show_fixity
$$ vcat (map GHC.pprInstance insts)
where
- show_fixity fix
- | fix == GHC.defaultFixity = empty
- | otherwise = ppr fix <+> pprInfixName (GHC.getName thing)
+ show_fixity
+ | fixity == GHC.defaultFixity = empty
+ | otherwise = ppr fixity <+> pprInfixName (GHC.getName thing)
-----------------------------------------------------------------------------
-- :main
@@ -1125,8 +1140,8 @@ defineMacro overwrite s = do
handleSourceError (\e -> GHC.printException e) $
do
hv <- GHC.compileExpr new_expr
- liftIO (writeIORef macros_ref --
- (filtered ++ [(macro_name, lift . runMacro hv, noCompletion)]))
+ liftIO (writeIORef macros_ref -- later defined macros have precedence
+ ((macro_name, lift . runMacro hv, noCompletion) : filtered))
runMacro :: GHC.HValue{-String -> IO String-} -> String -> GHCi Bool
runMacro fun s = do
@@ -1974,6 +1989,7 @@ newDynFlags interactive_only minus_opts = do
packageFlags idflags1 /= packageFlags idflags0) $ do
liftIO $ hPutStrLn stderr "cannot set package flags with :seti; use :set"
GHC.setInteractiveDynFlags idflags1
+ installInteractivePrint (interactivePrint idflags1) False
dflags0 <- getDynFlags
when (not interactive_only) $ do
@@ -2151,11 +2167,11 @@ showBindings = do
pprTT :: PrintExplicitForalls -> (TyThing, Fixity, [GHC.ClsInst]) -> SDoc
pprTT pefas (thing, fixity, _insts) =
pprTyThing pefas thing
- $$ show_fixity fixity
+ $$ show_fixity
where
- show_fixity fix
- | fix == GHC.defaultFixity = empty
- | otherwise = ppr fix <+> ppr (GHC.getName thing)
+ show_fixity
+ | fixity == GHC.defaultFixity = empty
+ | otherwise = ppr fixity <+> ppr (GHC.getName thing)
printTyThing :: TyThing -> GHCi ()
diff --git a/mk/config.mk.in b/mk/config.mk.in
index 36b277b979..81298e4a93 100644
--- a/mk/config.mk.in
+++ b/mk/config.mk.in
@@ -714,6 +714,7 @@ DTRACE = @DtraceCmd@
LD = @LdCmd@
NM = @NmCmd@
+OBJDUMP = @ObjdumpCmd@
LLC = @LlcCmd@
OPT = @OptCmd@
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 4e156e63d0..7c86efadb7 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -474,7 +474,7 @@ void setupRtsFlags (int *argc, char *argv[],
total_arg = *argc;
arg = 1;
- *argc = 1;
+ if (*argc > 1) { *argc = 1; };
rts_argc = 0;
rts_argv_size = total_arg + 1;
@@ -1677,16 +1677,22 @@ static void freeArgv(int argc, char *argv[])
void
setProgName(char *argv[])
{
+ char *last_slash;
+
+ if (argv[0] == NULL) { // #7037
+ prog_name = "";
+ return;
+ }
+
/* Remove directory from argv[0] -- default files in current directory */
#if !defined(mingw32_HOST_OS)
- char *last_slash;
if ( (last_slash = (char *) strrchr(argv[0], '/')) != NULL ) {
prog_name = last_slash+1;
} else {
prog_name = argv[0];
}
#else
- char* last_slash = argv[0] + (strlen(argv[0]) - 1);
+ last_slash = argv[0] + (strlen(argv[0]) - 1);
while ( last_slash > argv[0] ) {
if ( *last_slash == '/' || *last_slash == '\\' ) {
prog_name = last_slash+1;
diff --git a/rts/Schedule.c b/rts/Schedule.c
index 326565386a..5a8c35f0cb 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -2454,7 +2454,7 @@ exitScheduler (rtsBool wait_foreign USED_IF_THREADS)
sched_state = SCHED_INTERRUPTING;
Capability *cap = task->cap;
waitForReturnCapability(&cap,task);
- scheduleDoGC(&cap,task,rtsFalse);
+ scheduleDoGC(&cap,task,rtsTrue);
ASSERT(task->incall->tso == NULL);
releaseCapability(cap);
}
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index a6b8c4af64..ab0ba640c1 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -629,7 +629,16 @@ GarbageCollect (rtsBool force_major_gc,
}
// Reset the nursery: make the blocks empty
- allocated += clearNurseries();
+ if (n_gc_threads == 1) {
+ for (n = 0; n < n_capabilities; n++) {
+ allocated += clearNursery(&capabilities[n]);
+ }
+ } else {
+ gct->allocated = clearNursery(cap);
+ for (n = 0; n < n_capabilities; n++) {
+ allocated += gc_threads[n]->allocated;
+ }
+ }
resize_nursery();
@@ -1094,6 +1103,8 @@ gcWorkerThread (Capability *cap)
scavenge_until_all_done();
+ gct->allocated = clearNursery(cap);
+
#ifdef THREADED_RTS
// Now that the whole heap is marked, we discard any sparks that
// were found to be unreachable. The main GC thread is currently
@@ -1477,6 +1488,7 @@ init_gc_thread (gc_thread *t)
t->failed_to_evac = rtsFalse;
t->eager_promotion = rtsTrue;
t->thunk_selector_depth = 0;
+ t->allocated = 0;
t->copied = 0;
t->scanned = 0;
t->any_work = 0;
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index 60f721285d..1b811e43fc 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -176,6 +176,7 @@ typedef struct gc_thread_ {
// -------------------
// stats
+ lnat allocated; // result of clearNursery()
lnat copied;
lnat scanned;
lnat any_work;
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 17798a25b8..18d317d446 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -496,22 +496,19 @@ allocNurseries (nat from, nat to)
assignNurseriesToCapabilities(from, to);
}
-lnat // words allocated
-clearNurseries (void)
+lnat
+clearNursery (Capability *cap)
{
- lnat allocated = 0;
- nat i;
bdescr *bd;
+ lnat allocated = 0;
- for (i = 0; i < n_capabilities; i++) {
- for (bd = nurseries[i].blocks; bd; bd = bd->link) {
- allocated += (lnat)(bd->free - bd->start);
- capabilities[i].total_allocated += (lnat)(bd->free - bd->start);
- bd->free = bd->start;
- ASSERT(bd->gen_no == 0);
- ASSERT(bd->gen == g0);
- IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
- }
+ for (bd = nurseries[cap->no].blocks; bd; bd = bd->link) {
+ allocated += (lnat)(bd->free - bd->start);
+ cap->total_allocated += (lnat)(bd->free - bd->start);
+ bd->free = bd->start;
+ ASSERT(bd->gen_no == 0);
+ ASSERT(bd->gen == g0);
+ IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
}
return allocated;
diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h
index 44f39ee29b..9dffc18f2d 100644
--- a/rts/sm/Storage.h
+++ b/rts/sm/Storage.h
@@ -81,7 +81,7 @@ void dirty_MVAR(StgRegTable *reg, StgClosure *p);
extern nursery *nurseries;
void resetNurseries ( void );
-lnat clearNurseries ( void );
+lnat clearNursery ( Capability *cap );
void resizeNurseries ( nat blocks );
void resizeNurseriesFixed ( nat blocks );
lnat countNurseryBlocks ( void );
diff --git a/rules/build-package-data.mk b/rules/build-package-data.mk
index 25b27741e3..817128e08f 100644
--- a/rules/build-package-data.mk
+++ b/rules/build-package-data.mk
@@ -18,11 +18,11 @@ $(call profStart, build-package-data($1,$2,$3))
# $2 = distdir
# $3 = GHC stage to use (0 == bootstrapping compiler)
-ifeq "$$(filter p,$$(GhcLibWays))" "p"
+ifeq "$$(filter p,$$($1_$2_WAYS))" "p"
$1_$2_CONFIGURE_OPTS += --enable-library-profiling
endif
-ifeq "$$(filter dyn,$$(GhcLibWays))" "dyn"
+ifeq "$$(filter dyn,$$($1_$2_WAYS))" "dyn"
$1_$2_CONFIGURE_OPTS += --enable-shared
endif
diff --git a/rules/build-package-way.mk b/rules/build-package-way.mk
index 9919d3b954..176dc9bd83 100644
--- a/rules/build-package-way.mk
+++ b/rules/build-package-way.mk
@@ -69,6 +69,9 @@ $$($1_$2_$3_LIB) : $$($1_$2_$3_ALL_OBJS) $$(ALL_RTS_LIBS) $$($1_$2_$3_DEPS_LIBS)
$$(addprefix -l,$$($1_$2_EXTRA_LIBRARIES)) \
-no-auto-link-packages \
-o $$@
+# Now check that the DLL doesn't have too many symbols. See trac #5987.
+ case `$$(OBJDUMP) -p $$@ | sed -n "1,/^.Ordinal\/Name Pointer/ D; p; /^$$$$/ q" | grep "\[ *0\]" | wc -l` in 1) echo DLL $$@ OK;; 0) echo No symbols in DLL $$@; exit 1;; [0-9]*) echo Too many symbols in DLL $$@; exit 1;; *) echo bad DLL $$@; exit 1;; esac
+
else
$$($1_$2_$3_LIB) : $$($1_$2_$3_ALL_OBJS) $$(ALL_RTS_LIBS) $$($1_$2_$3_DEPS_LIBS)
$$(call cmd,$1_$2_HC) $$($1_$2_$3_ALL_HC_OPTS) $$($1_$2_$3_ALL_OBJS) \
diff --git a/rules/build-package.mk b/rules/build-package.mk
index ccd1659c30..c97f8c4b2c 100644
--- a/rules/build-package.mk
+++ b/rules/build-package.mk
@@ -70,7 +70,7 @@ $(call package-config,$1,$2,$3)
ifeq "$3" "0"
$1_$2_WAYS = v
else
-$1_$2_WAYS = $$(GhcLibWays)
+$1_$2_WAYS = $$(filter-out $$($1_$2_EXCLUDED_WAYS),$$(GhcLibWays))
endif
# We must use a different dependency file if $(GhcLibWays) changes, so
diff --git a/utils/ghc-pkg/Main.hs b/utils/ghc-pkg/Main.hs
index 3389c07369..ddc4821a07 100644
--- a/utils/ghc-pkg/Main.hs
+++ b/utils/ghc-pkg/Main.hs
@@ -129,10 +129,12 @@ flags = [
"use the current user's package database",
Option [] ["global"] (NoArg FlagGlobal)
"use the global package database",
- Option ['f'] ["package-db", "package-conf"] (ReqArg FlagConfig "FILE")
- "use the specified package config file",
- Option [] ["global-package-db"] (ReqArg FlagGlobalConfig "FILE")
- "location of the global package config",
+ Option ['f'] ["package-db"] (ReqArg FlagConfig "FILE/DIR")
+ "use the specified package database",
+ Option [] ["package-conf"] (ReqArg FlagConfig "FILE/DIR")
+ "use the specified package database (DEPRECATED)",
+ Option [] ["global-package-db"] (ReqArg FlagGlobalConfig "DIR")
+ "location of the global package database",
Option [] ["no-user-package-db"] (NoArg FlagNoUserDb)
"never read the user package database",
Option [] ["force"] (NoArg FlagForce)