122 files changed, 1421 insertions, 400 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml
index b4a99473c8..2868f8d8eb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -31,6 +31,11 @@ aliases:
     run:
       name: Configure
       command: ./configure
+  - &configure_unix_32
+    run:
+      name: Configure
+      command: |
+        setarch i386 ./configure --with-ghc=/opt/ghc-i386/8.2.2/bin/ghc
   - &configure_bsd
     run:
       name: Configure
@@ -207,6 +212,24 @@ jobs:
       - *make
       - *test
 
+  "validate-i386-linux":
+    resource_class: xlarge
+    docker:
+      - image: mrkkrp/ghcci-i386-linux:0.0.1
+    environment:
+      <<: *buildenv
+    steps:
+      - checkout
+      - *prepare
+      - *submodules
+      - *boot
+      - *configure_unix_32
+      - *make
+      - *test
+      - *bindist
+      - *collectartifacts
+      - *storeartifacts
+
 workflows:
   version: 2
   validate:
@@ -216,6 +239,7 @@ workflows:
     # - validate-x86_64-freebsd
     - validate-x86_64-darwin
     - validate-x86_64-linux-llvm
+    - validate-i386-linux
     - validate-hadrian-x86_64-linux
 
   nightly:
diff --git a/.circleci/images/i386-linux/Dockerfile b/.circleci/images/i386-linux/Dockerfile
new file mode 100644
index 0000000000..7d3e968195
--- /dev/null
+++ b/.circleci/images/i386-linux/Dockerfile
@@ -0,0 +1,30 @@
+# This Dockerfile tries to replicate haskell:8.2 a bit, but it does so on
+# top of i368/debian:jessie instead of debian:jessie because I had troubles
+# making i386 GHC bindist working there.
+
+FROM i386/debian:jessie
+
+ENV LANG C.UTF-8
+
+# Install the necessary packages, including HVR stuff.
+RUN echo 'deb http://ppa.launchpad.net/hvr/ghc/ubuntu trusty main' > /etc/apt/sources.list.d/ghc.list
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F6F88286
+RUN apt-get update -qq
+RUN apt-get install -qy git make automake autoconf gcc perl python3 texinfo xz-utils lbzip2 bzip2 patch openssh-client sudo curl zlib1g-dev libtinfo-dev libsqlite3-0 libsqlite3-dev ca-certificates g++ cabal-install-2.0 ghc-8.2.2 happy-1.19.5 alex-3.1.7
+ENV PATH /opt/cabal/2.0/bin:/opt/ghc/8.2.2/bin:/opt/happy/1.19.5/bin:/opt/alex/3.1.7/bin:$PATH
+
+# Get i386 GHC bindist for 32 bit CI builds.
+RUN cd /tmp && curl https://downloads.haskell.org/~ghc/8.2.2/ghc-8.2.2-i386-deb8-linux.tar.xz | tar -Jx
+RUN cd /tmp/ghc-8.2.2 && setarch i386 ./configure --prefix=/opt/ghc-i386/8.2.2 CFLAGS=-m32 --target=i386-unknown-linux --build=i386-unknown-linux --host=i386-unknown-linux
+RUN cd /tmp/ghc-8.2.2 && make install
+RUN rm -rf /tmp/ghc-8.2.2
+ENV PATH /opt/ghc-i386/8.2.2/bin:$PATH
+
+# Create a normal user.
+RUN adduser ghc --gecos "GHC builds" --disabled-password
+RUN echo "ghc ALL = NOPASSWD : ALL" > /etc/sudoers.d/ghc
+USER ghc
+
+WORKDIR /home/ghc/
+
+CMD ["bash"]
diff --git a/aclocal.m4 b/aclocal.m4
index 58d43e1ef0..e19cbf2aa9 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1970,7 +1970,7 @@ AC_DEFUN([GHC_CONVERT_OS],[
         $3="openbsd"
         ;;
       # As far as I'm aware, none of these have relevant variants
-      freebsd|netbsd|dragonfly|hpux|linuxaout|kfreebsdgnu|freebsd2|solaris2|mingw32|darwin|nextstep2|nextstep3|sunos4|ultrix|haiku)
+      freebsd|netbsd|dragonfly|hpux|linuxaout|kfreebsdgnu|freebsd2|mingw32|darwin|nextstep2|nextstep3|sunos4|ultrix|haiku)
         $3="$1"
         ;;
       aix*) # e.g. powerpc-ibm-aix7.1.3.0
@@ -1979,6 +1979,9 @@ AC_DEFUN([GHC_CONVERT_OS],[
       darwin*) # e.g. aarch64-apple-darwin14
         $3="darwin"
         ;;
+      solaris2*)
+        $3="solaris2"
+        ;;
       freebsd*) # like i686-gentoo-freebsd7
                 #      i686-gentoo-freebsd8
                 #      i686-gentoo-freebsd8.2
diff --git a/compiler/basicTypes/Name.hs b/compiler/basicTypes/Name.hs
index 02eb0678ee..6941dd9c55 100644
--- a/compiler/basicTypes/Name.hs
+++ b/compiler/basicTypes/Name.hs
@@ -70,7 +70,7 @@ module Name (
         NamedThing(..),
         getSrcLoc, getSrcSpan, getOccString, getOccFS,
 
-        pprInfixName, pprPrefixName, pprModulePrefix,
+        pprInfixName, pprPrefixName, pprModulePrefix, pprNameUnqualified,
         nameStableString,
 
         -- Re-export the OccName stuff
@@ -535,6 +535,10 @@ pprName (Name {n_sort = sort, n_uniq = uniq, n_occ = occ})
       System                  -> pprSystem sty uniq occ
       Internal                -> pprInternal sty uniq occ
 
+-- | Print the string of Name unqualifiedly directly.
+pprNameUnqualified :: Name -> SDoc
+pprNameUnqualified Name { n_occ = occ } = ppr_occ_name occ
+
 pprExternal :: PprStyle -> Unique -> Module -> OccName -> Bool -> BuiltInSyntax -> SDoc
 pprExternal sty uniq mod occ is_wired is_builtin
   | codeStyle sty = ppr mod <> char '_' <> ppr_z_occ_name occ
diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs
index 9f832731b1..50d48afb38 100644
--- a/compiler/cmm/Cmm.hs
+++ b/compiler/cmm/Cmm.hs
@@ -1,5 +1,5 @@
 -- Cmm representations using Hoopl's Graph CmmNode e x.
-{-# LANGUAGE CPP, GADTs #-}
+{-# LANGUAGE GADTs #-}
 
 module Cmm (
      -- * Cmm top-level datatypes
diff --git a/compiler/cmm/CmmCommonBlockElim.hs b/compiler/cmm/CmmCommonBlockElim.hs
index fce8f7dae8..c91d553c47 100644
--- a/compiler/cmm/CmmCommonBlockElim.hs
+++ b/compiler/cmm/CmmCommonBlockElim.hs
@@ -64,7 +64,9 @@ elimCommonBlocks :: CmmGraph -> CmmGraph
 elimCommonBlocks g = replaceLabels env $ copyTicks env g
   where
      env = iterate mapEmpty blocks_with_key
-     groups = groupByInt hash_block (postorderDfs g)
+     -- The order of blocks doesn't matter here, but revPostorder also drops any
+     -- unreachable blocks, which is useful.
+     groups = groupByInt hash_block (revPostorder g)
      blocks_with_key = [ [ (successors b, [b]) | b <- bs] | bs <- groups]
 
 -- Invariant: The blocks in the list are pairwise distinct
diff --git a/compiler/cmm/CmmContFlowOpt.hs b/compiler/cmm/CmmContFlowOpt.hs
index da365cfe7f..9f091da8c2 100644
--- a/compiler/cmm/CmmContFlowOpt.hs
+++ b/compiler/cmm/CmmContFlowOpt.hs
@@ -174,10 +174,9 @@ blockConcat splitting_procs g@CmmGraph { g_entry = entry_id }
        | otherwise
        = (entry_id, shortcut_map)
 
-     -- blocks is a list of blocks in DFS postorder, while blockmap is
-     -- a map of blocks. We process each element from blocks and update
-     -- blockmap accordingly
-     blocks = postorderDfs g
+     -- blocks are sorted in reverse postorder, but we want to go from the exit
+     -- towards beginning, so we use foldr below.
+     blocks = revPostorder g
      blockmap = foldl' (flip addBlock) emptyBody blocks
 
      -- Accumulator contains three components:
@@ -435,7 +434,7 @@ removeUnreachableBlocksProc proc@(CmmProc info lbl live g)
                   | otherwise               = env
 
      used_blocks :: [CmmBlock]
-     used_blocks = postorderDfs g
+     used_blocks = revPostorder g
 
      used_lbls :: LabelSet
      used_lbls = setFromList $ map entryLabel used_blocks
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index bae5a739ca..946e146f9e 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -1,5 +1,4 @@
 {-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE CPP #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs
index 3f1633404c..d2525d1ffd 100644
--- a/compiler/cmm/CmmLayoutStack.hs
+++ b/compiler/cmm/CmmLayoutStack.hs
@@ -244,7 +244,7 @@ cmmLayoutStack dflags procpoints entry_args
     -- We need liveness info. Dead assignments are removed later
     -- by the sinking pass.
     let liveness = cmmLocalLiveness dflags graph
-        blocks = postorderDfs graph
+        blocks = revPostorder graph
 
     (final_stackmaps, _final_high_sp, new_blocks) <-
           mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
diff --git a/compiler/cmm/CmmOpt.hs b/compiler/cmm/CmmOpt.hs
index 6b4d792122..e837d29783 100644
--- a/compiler/cmm/CmmOpt.hs
+++ b/compiler/cmm/CmmOpt.hs
@@ -422,14 +422,6 @@ That's what the constant-folding operations on comparison operators do above.
 -- -----------------------------------------------------------------------------
 -- Utils
 
-isLit :: CmmExpr -> Bool
-isLit (CmmLit _) = True
-isLit _          = False
-
-isComparisonExpr :: CmmExpr -> Bool
-isComparisonExpr (CmmMachOp op _) = isComparisonMachOp op
-isComparisonExpr _                  = False
-
 isPicReg :: CmmExpr -> Bool
 isPicReg (CmmReg (CmmGlobal PicBaseReg)) = True
 isPicReg _ = False
diff --git a/compiler/cmm/CmmProcPoint.hs b/compiler/cmm/CmmProcPoint.hs
index eeae96083a..bef8f384b8 100644
--- a/compiler/cmm/CmmProcPoint.hs
+++ b/compiler/cmm/CmmProcPoint.hs
@@ -190,7 +190,7 @@ minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph
 -- Given the set of successors of calls (which must be proc-points)
 -- figure out the minimal set of necessary proc-points
 minimalProcPointSet platform callProcPoints g
-  = extendPPSet platform g (postorderDfs g) callProcPoints
+  = extendPPSet platform g (revPostorder g) callProcPoints
 
 extendPPSet
     :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> UniqSM ProcPointSet
@@ -242,11 +242,11 @@ splitAtProcPoints dflags entry_label callPPs procPoints procMap
                   (CmmProc (TopInfo {info_tbls = info_tbls})
                            top_l _ g@(CmmGraph {g_entry=entry})) =
   do -- Build a map from procpoints to the blocks they reach
-     let addBlock
+     let add_block
              :: LabelMap (LabelMap CmmBlock)
              -> CmmBlock
              -> LabelMap (LabelMap CmmBlock)
-         addBlock graphEnv b =
+         add_block graphEnv b =
            case mapLookup bid procMap of
              Just ProcPoint -> add graphEnv bid bid b
              Just (ReachedBy set) ->
@@ -265,7 +265,7 @@ splitAtProcPoints dflags entry_label callPPs procPoints procMap
                          regSetToList $
                          expectJust "ppLiveness" $ mapLookup pp liveness
 
-     graphEnv <- return $ foldlGraphBlocks addBlock mapEmpty g
+     graphEnv <- return $ foldlGraphBlocks add_block mapEmpty g
 
      -- Build a map from proc point BlockId to pairs of:
      --  * Labels for their new procedures
@@ -330,7 +330,7 @@ splitAtProcPoints dflags entry_label callPPs procPoints procMap
                   -- replace branches to procpoints with branches to jumps
                   blockEnv'' = toBlockMap $ replaceBranches jumpEnv $ ofBlockMap ppId blockEnv'
                   -- add the jump blocks to the graph
-                  blockEnv''' = foldl' (flip insertBlock) blockEnv'' jumpBlocks
+                  blockEnv''' = foldl' (flip addBlock) blockEnv'' jumpBlocks
               let g' = ofBlockMap ppId blockEnv'''
               -- pprTrace "g' pre jumps" (ppr g') $ do
               return (mapInsert ppId g' newGraphEnv)
@@ -374,8 +374,8 @@ splitAtProcPoints dflags entry_label callPPs procPoints procMap
      -- reversed later.
      let (_, block_order) =
              foldl' add_block_num (0::Int, mapEmpty :: LabelMap Int)
-                    (postorderDfs g)
-         add_block_num (!i, !map) block =
+                   (revPostorder g)
+         add_block_num (i, map) block =
            (i + 1, mapInsert (entryLabel block) i map)
          sort_fn (bid, _) (bid', _) =
            compare (expectJust "block_order" $ mapLookup bid  block_order)
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs
index 487f0bc244..43444639e1 100644
--- a/compiler/cmm/CmmSink.hs
+++ b/compiler/cmm/CmmSink.hs
@@ -173,7 +173,7 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
   liveness = cmmLocalLiveness dflags graph
   getLive l = mapFindWithDefault Set.empty l liveness
 
-  blocks = postorderDfs graph
+  blocks = revPostorder graph
 
   join_pts = findJoinPoints blocks
 
@@ -458,17 +458,7 @@ tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
         occurs_once = not l_live && l_usages == Just 1
         occurs_none = not l_live && l_usages == Nothing
 
-        inl_node = case mapExpDeep inl_exp node of
-                     -- See Note [Improving conditionals]
-                     CmmCondBranch (CmmMachOp (MO_Ne w) args)
-                                   ti fi l
-                           -> CmmCondBranch (cmmMachOpFold dflags (MO_Eq w) args)
-                                            fi ti (inv_likeliness l)
-                     node' -> node'
-
-        inv_likeliness :: Maybe Bool -> Maybe Bool
-        inv_likeliness Nothing  = Nothing
-        inv_likeliness (Just l) = Just (not l)
+        inl_node = improveConditional (mapExpDeep inl_exp node)
 
         inl_exp :: CmmExpr -> CmmExpr
         -- inl_exp is where the inlining actually takes place!
@@ -479,22 +469,43 @@ tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
         inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
         inl_exp other = other
 
-{- Note [Improving conditionals]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Given
-  CmmCondBranch ((a >## b) != 1) t f
-where a,b, are Floats, the constant folder /cannot/ turn it into
-  CmmCondBranch (a <=## b) t f
-because comparison on floats are not invertible
-(see CmmMachOp.maybeInvertComparison).
 
-What we want instead is simply to reverse the true/false branches thus
+{- Note [improveConditional]
+
+cmmMachOpFold tries to simplify conditionals to turn things like
+  (a == b) != 1
+into
+  (a != b)
+but there's one case it can't handle: when the comparison is over
+floating-point values, we can't invert it, because floating-point
+comparisions aren't invertible (because NaN).
+
+But we *can* optimise this conditional by swapping the true and false
+branches. Given
   CmmCondBranch ((a >## b) != 1) t f
--->
+we can turn it into
   CmmCondBranch (a >## b) f t
 
-And we do that right here in tryToInline, just as we do cmmMachOpFold.
+So here we catch conditionals that weren't optimised by cmmMachOpFold,
+and apply above transformation to eliminate the comparison against 1.
+
+It's tempting to just turn every != into == and then let cmmMachOpFold
+do its thing, but that risks changing a nice fall-through conditional
+into one that requires two jumps. (see swapcond_last in
+CmmContFlowOpt), so instead we carefully look for just the cases where
+we can eliminate a comparison.
 -}
+improveConditional :: CmmNode O x -> CmmNode O x
+improveConditional
+  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
+  | neLike mop, isComparisonExpr x
+  = CmmCondBranch x f t (fmap not l)
+  where
+    neLike (MO_Ne _) = True
+    neLike (MO_U_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
+    neLike (MO_S_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
+    neLike _ = False
+improveConditional other = other
 
 -- Note [dependent assignments]
 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs
index 4a1d874d8f..53dbcddfbb 100644
--- a/compiler/cmm/CmmUtils.hs
+++ b/compiler/cmm/CmmUtils.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE CPP, GADTs, RankNTypes #-}
+{-# LANGUAGE GADTs, RankNTypes #-}
 
 -----------------------------------------------------------------------------
 --
@@ -35,7 +35,7 @@ module CmmUtils(
         cmmSubWord, cmmAddWord, cmmMulWord, cmmQuotWord,
         cmmToWord,
 
-        isTrivialCmmExpr, hasNoGlobalRegs,
+        isTrivialCmmExpr, hasNoGlobalRegs, isLit, isComparisonExpr,
 
         baseExpr, spExpr, hpExpr, spLimExpr, hpLimExpr,
         currentTSOExpr, currentNurseryExpr, cccsExpr,
@@ -56,17 +56,15 @@ module CmmUtils(
         -- * Operations that probably don't belong here
         modifyGraph,
 
-        ofBlockMap, toBlockMap, insertBlock,
+        ofBlockMap, toBlockMap,
         ofBlockList, toBlockList, bodyToBlockList,
         toBlockListEntryFirst, toBlockListEntryFirstFalseFallthrough,
-        foldlGraphBlocks, mapGraphNodes, postorderDfs, mapGraphNodes1,
+        foldlGraphBlocks, mapGraphNodes, revPostorder, mapGraphNodes1,
 
         -- * Ticks
         blockTicks
   ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude
 
 import TyCon    ( PrimRep(..), PrimElemRep(..) )
@@ -78,11 +76,9 @@ import BlockId
 import CLabel
 import Outputable
 import DynFlags
-import Util
 import CodeGen.Platform
 
 import Data.Word
-import Data.Maybe
 import Data.Bits
 import Hoopl.Graph
 import Hoopl.Label
@@ -389,6 +385,14 @@ hasNoGlobalRegs (CmmReg (CmmLocal _))      = True
 hasNoGlobalRegs (CmmRegOff (CmmLocal _) _) = True
 hasNoGlobalRegs _ = False
 
+isLit :: CmmExpr -> Bool
+isLit (CmmLit _) = True
+isLit _          = False
+
+isComparisonExpr :: CmmExpr -> Bool
+isComparisonExpr (CmmMachOp op _) = isComparisonMachOp op
+isComparisonExpr _                  = False
+
 ---------------------------------------------------
 --
 --      Tagging
@@ -487,12 +491,6 @@ toBlockMap (CmmGraph {g_graph=GMany NothingO body NothingO}) = body
 ofBlockMap :: BlockId -> LabelMap CmmBlock -> CmmGraph
 ofBlockMap entry bodyMap = CmmGraph {g_entry=entry, g_graph=GMany NothingO bodyMap NothingO}
 
-insertBlock :: CmmBlock -> LabelMap CmmBlock -> LabelMap CmmBlock
-insertBlock block map =
-  ASSERT(isNothing $ mapLookup id map)
-  mapInsert id block map
-  where id = entryLabel block
-
 toBlockList :: CmmGraph -> [CmmBlock]
 toBlockList g = mapElems $ toBlockMap g
 
@@ -558,8 +556,9 @@ mapGraphNodes1 f = modifyGraph (mapGraph f)
 foldlGraphBlocks :: (a -> CmmBlock -> a) -> a -> CmmGraph -> a
 foldlGraphBlocks k z g = mapFoldl k z $ toBlockMap g
 
-postorderDfs :: CmmGraph -> [CmmBlock]
-postorderDfs g = {-# SCC "postorderDfs" #-} postorder_dfs_from (toBlockMap g) (g_entry g)
+revPostorder :: CmmGraph -> [CmmBlock]
+revPostorder g = {-# SCC "revPostorder" #-}
+    revPostorderFrom (toBlockMap g) (g_entry g)
 
 -------------------------------------------------
 -- Tick utilities
diff --git a/compiler/cmm/Hoopl/Collections.hs b/compiler/cmm/Hoopl/Collections.hs
index b8072b37a7..ef7de4a078 100644
--- a/compiler/cmm/Hoopl/Collections.hs
+++ b/compiler/cmm/Hoopl/Collections.hs
@@ -12,7 +12,7 @@ module Hoopl.Collections
 
 import GhcPrelude
 
-import qualified Data.IntMap as M
+import qualified Data.IntMap.Strict as M
 import qualified Data.IntSet as S
 
 import Data.List (foldl', foldl1')
@@ -66,6 +66,7 @@ class IsMap map where
   mapInsert :: KeyOf map -> a -> map a -> map a
   mapInsertWith :: (a -> a -> a) -> KeyOf map -> a -> map a -> map a
   mapDelete :: KeyOf map -> map a -> map a
+  mapAlter :: (Maybe a -> Maybe a) -> KeyOf map -> map a -> map a
 
   mapUnion :: map a -> map a -> map a
   mapUnionWithKey :: (KeyOf map -> a -> a -> a) -> map a -> map a -> map a
@@ -143,6 +144,7 @@ instance IsMap UniqueMap where
   mapInsert k v (UM m) = UM (M.insert k v m)
   mapInsertWith f k v (UM m) = UM (M.insertWith f k v m)
   mapDelete k (UM m) = UM (M.delete k m)
+  mapAlter f k (UM m) = UM (M.alter f k m)
 
   mapUnion (UM x) (UM y) = UM (M.union x y)
   mapUnionWithKey f (UM x) (UM y) = UM (M.unionWithKey f x y)
diff --git a/compiler/cmm/Hoopl/Dataflow.hs b/compiler/cmm/Hoopl/Dataflow.hs
index 0b0434bb36..2538b70ee3 100644
--- a/compiler/cmm/Hoopl/Dataflow.hs
+++ b/compiler/cmm/Hoopl/Dataflow.hs
@@ -111,8 +111,7 @@ analyzeCmm dir lattice transfer cmmGraph initFact =
         blockMap =
             case hooplGraph of
                 GMany NothingO bm NothingO -> bm
-        entries = if mapNull initFact then [entry] else mapKeys initFact
-    in fixpointAnalysis dir lattice transfer entries blockMap initFact
+    in fixpointAnalysis dir lattice transfer entry blockMap initFact
 
 -- Fixpoint algorithm.
 fixpointAnalysis
@@ -120,16 +119,16 @@ fixpointAnalysis
        Direction
     -> DataflowLattice f
     -> TransferFun f
-    -> [Label]
+    -> Label
     -> LabelMap CmmBlock
     -> FactBase f
     -> FactBase f
-fixpointAnalysis direction lattice do_block entries blockmap = loop start
+fixpointAnalysis direction lattice do_block entry blockmap = loop start
   where
     -- Sorting the blocks helps to minimize the number of times we need to
     -- process blocks. For instance, for forward analysis we want to look at
     -- blocks in reverse postorder. Also, see comments for sortBlocks.
-    blocks     = sortBlocks direction entries blockmap
+    blocks     = sortBlocks direction entry blockmap
     num_blocks = length blocks
     block_arr  = {-# SCC "block_arr" #-} listArray (0, num_blocks - 1) blocks
     start      = {-# SCC "start" #-} IntSet.fromDistinctAscList
@@ -174,9 +173,8 @@ rewriteCmm dir lattice rwFun cmmGraph initFact = do
         blockMap1 =
             case hooplGraph of
                 GMany NothingO bm NothingO -> bm
-        entries = if mapNull initFact then [entry] else mapKeys initFact
     (blockMap2, facts) <-
-        fixpointRewrite dir lattice rwFun entries blockMap1 initFact
+        fixpointRewrite dir lattice rwFun entry blockMap1 initFact
     return (cmmGraph {g_graph = GMany NothingO blockMap2 NothingO}, facts)
 
 fixpointRewrite
@@ -184,16 +182,16 @@ fixpointRewrite
        Direction
     -> DataflowLattice f
     -> RewriteFun f
-    -> [Label]
+    -> Label
     -> LabelMap CmmBlock
     -> FactBase f
     -> UniqSM (LabelMap CmmBlock, FactBase f)
-fixpointRewrite dir lattice do_block entries blockmap = loop start blockmap
+fixpointRewrite dir lattice do_block entry blockmap = loop start blockmap
   where
     -- Sorting the blocks helps to minimize the number of times we need to
     -- process blocks. For instance, for forward analysis we want to look at
     -- blocks in reverse postorder. Also, see comments for sortBlocks.
-    blocks     = sortBlocks dir entries blockmap
+    blocks     = sortBlocks dir entry blockmap
     num_blocks = length blocks
     block_arr  = {-# SCC "block_arr_rewrite" #-}
                  listArray (0, num_blocks - 1) blocks
@@ -268,20 +266,15 @@ we'll propagate (x=4) to L4, and nuke the otherwise-good rewriting of L4.
 -- | Sort the blocks into the right order for analysis. This means reverse
 -- postorder for a forward analysis. For the backward one, we simply reverse
 -- that (see Note [Backward vs forward analysis]).
---
--- Note: We're using Hoopl's confusingly named `postorder_dfs_from` but AFAICS
--- it returns the *reverse* postorder of the blocks (it visits blocks in the
--- postorder and uses (:) to collect them, which gives the reverse of the
--- visitation order).
 sortBlocks
     :: NonLocal n
-    => Direction -> [Label] -> LabelMap (Block n C C) -> [Block n C C]
-sortBlocks direction entries blockmap =
+    => Direction -> Label -> LabelMap (Block n C C) -> [Block n C C]
+sortBlocks direction entry blockmap =
     case direction of
         Fwd -> fwd
         Bwd -> reverse fwd
   where
-    fwd = postorder_dfs_from blockmap entries
+    fwd = revPostorderFrom blockmap entry
 
 -- Note [Backward vs forward analysis]
 --
diff --git a/compiler/cmm/Hoopl/Graph.hs b/compiler/cmm/Hoopl/Graph.hs
index ca482ab4a8..0142f70c76 100644
--- a/compiler/cmm/Hoopl/Graph.hs
+++ b/compiler/cmm/Hoopl/Graph.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE GADTs #-}
 {-# LANGUAGE RankNTypes #-}
@@ -14,11 +15,12 @@ module Hoopl.Graph
     , labelsDefined
     , mapGraph
     , mapGraphBlocks
-    , postorder_dfs_from
+    , revPostorderFrom
     ) where
 
 
 import GhcPrelude
+import Util
 
 import Hoopl.Label
 import Hoopl.Block
@@ -51,13 +53,14 @@ emptyBody = mapEmpty
 bodyList :: Body' block n -> [(Label,block n C C)]
 bodyList body = mapToList body
 
-addBlock :: NonLocal thing
-         => thing C C -> LabelMap (thing C C)
-         -> LabelMap (thing C C)
-addBlock b body
-  | mapMember lbl body = error $ "duplicate label " ++ show lbl ++ " in graph"
-  | otherwise          = mapInsert lbl b body
-  where lbl = entryLabel b
+addBlock
+    :: (NonLocal block, HasDebugCallStack)
+    => block C C -> LabelMap (block C C) -> LabelMap (block C C)
+addBlock block body = mapAlter add lbl body
+  where
+    lbl = entryLabel block
+    add Nothing = Just block
+    add _ = error $ "duplicate label " ++ show lbl ++ " in graph"
 
 
 -- ---------------------------------------------------------------------------
@@ -119,22 +122,10 @@ labelsDefined (GMany _ body x) = mapFoldlWithKey addEntry (exitLabel x) body
 
 ----------------------------------------------------------------
 
-class LabelsPtr l where
-  targetLabels :: l -> [Label]
-
-instance NonLocal n => LabelsPtr (n e C) where
-  targetLabels n = successors n
-
-instance LabelsPtr Label where
-  targetLabels l = [l]
-
-instance LabelsPtr LabelSet where
-  targetLabels = setElems
-
-instance LabelsPtr l => LabelsPtr [l] where
-  targetLabels = concatMap targetLabels
-
--- | This is the most important traversal over this data structure.  It drops
+-- | Returns a list of blocks reachable from the provided Labels in the reverse
+-- postorder.
+--
+-- This is the most important traversal over this data structure.  It drops
 -- unreachable code and puts blocks in an order that is good for solving forward
 -- dataflow problems quickly.  The reverse order is good for solving backward
 -- dataflow problems quickly.  The forward order is also reasonably good for
@@ -143,59 +134,52 @@ instance LabelsPtr l => LabelsPtr [l] where
 -- that you would need a more serious analysis, probably based on dominators, to
 -- identify loop headers.
 --
--- The ubiquity of 'postorder_dfs' is one reason for the ubiquity of the 'LGraph'
--- representation, when for most purposes the plain 'Graph' representation is
--- more mathematically elegant (but results in more complicated code).
---
--- Here's an easy way to go wrong!  Consider
+-- For forward analyses we want reverse postorder visitation, consider:
 -- @
 --      A -> [B,C]
 --      B -> D
 --      C -> D
 -- @
--- Then ordinary dfs would give [A,B,D,C] which has a back ref from C to D.
--- Better to get [A,B,C,D]
-
-
--- | Traversal: 'postorder_dfs' returns a list of blocks reachable
--- from the entry of enterable graph. The entry and exit are *not* included.
--- The list has the following property:
---
---      Say a "back reference" exists if one of a block's
---      control-flow successors precedes it in the output list
---
---      Then there are as few back references as possible
---
--- The output is suitable for use in
--- a forward dataflow problem.  For a backward problem, simply reverse
--- the list.  ('postorder_dfs' is sufficiently tricky to implement that
--- one doesn't want to try and maintain both forward and backward
--- versions.)
-
-postorder_dfs_from_except :: forall block e . (NonLocal block, LabelsPtr e)
-                          => LabelMap (block C C) -> e -> LabelSet -> [block C C]
-postorder_dfs_from_except blocks b visited =
- vchildren (get_children b) (\acc _visited -> acc) [] visited
- where
-   vnode :: block C C -> ([block C C] -> LabelSet -> a) -> [block C C] -> LabelSet -> a
-   vnode block cont acc visited =
-        if setMember id visited then
-            cont acc visited
-        else
-            let cont' acc visited = cont (block:acc) visited in
-            vchildren (get_children block) cont' acc (setInsert id visited)
-      where id = entryLabel block
-   vchildren :: forall a. [block C C] -> ([block C C] -> LabelSet -> a) -> [block C C] -> LabelSet -> a
-   vchildren bs cont acc visited = next bs acc visited
-      where next children acc visited =
-                case children of []     -> cont acc visited
-                                 (b:bs) -> vnode b (next bs) acc visited
-   get_children :: forall l. LabelsPtr l => l -> [block C C]
-   get_children block = foldr add_id [] $ targetLabels block
-   add_id id rst = case lookupFact id blocks of
-                      Just b -> b : rst
-                      Nothing -> rst
-
-postorder_dfs_from
-    :: (NonLocal block, LabelsPtr b) => LabelMap (block C C) -> b -> [block C C]
-postorder_dfs_from blocks b = postorder_dfs_from_except blocks b setEmpty
+-- Postorder: [D, C, B, A] (or [D, B, C, A])
+-- Reverse postorder: [A, B, C, D] (or [A, C, B, D])
+-- This matters for, e.g., forward analysis, because we want to analyze *both*
+-- B and C before we analyze D.
+revPostorderFrom
+  :: forall block.  (NonLocal block)
+  => LabelMap (block C C) -> Label -> [block C C]
+revPostorderFrom graph start = go start_worklist setEmpty []
+  where
+    start_worklist = lookup_for_descend start Nil
+
+    -- To compute the postorder we need to "visit" a block (mark as done)
+    -- *after* visiting all its successors. So we need to know whether we
+    -- already processed all successors of each block (and @NonLocal@ allows
+    -- arbitrary many successors). So we use an explicit stack with an extra bit
+    -- of information:
+    -- * @ConsTodo@ means to explore the block if it wasn't visited before
+    -- * @ConsMark@ means that all successors were already done and we can add
+    --   the block to the result.
+    --
+    -- NOTE: We add blocks to the result list in postorder, but we *prepend*
+    -- them (i.e., we use @(:)@), which means that the final list is in reverse
+    -- postorder.
+    go :: DfsStack (block C C) -> LabelSet -> [block C C] -> [block C C]
+    go Nil                      !_           !result = result
+    go (ConsMark block rest)    !wip_or_done !result =
+        go rest wip_or_done (block : result)
+    go (ConsTodo block rest)    !wip_or_done !result
+        | entryLabel block `setMember` wip_or_done = go rest wip_or_done result
+        | otherwise =
+            let new_worklist =
+                    foldr lookup_for_descend
+                          (ConsMark block rest)
+                          (successors block)
+            in go new_worklist (setInsert (entryLabel block) wip_or_done) result
+
+    lookup_for_descend :: Label -> DfsStack (block C C) -> DfsStack (block C C)
+    lookup_for_descend label wl
+      | Just b <- mapLookup label graph = ConsTodo b wl
+      | otherwise =
+           error $ "Label that doesn't have a block?! " ++ show label
+
+data DfsStack a = ConsTodo a (DfsStack a) | ConsMark a (DfsStack a) | Nil
diff --git a/compiler/cmm/Hoopl/Label.hs b/compiler/cmm/Hoopl/Label.hs
index 8096fab073..6eae115779 100644
--- a/compiler/cmm/Hoopl/Label.hs
+++ b/compiler/cmm/Hoopl/Label.hs
@@ -87,6 +87,7 @@ instance IsMap LabelMap where
   mapInsert (Label k) v (LM m) = LM (mapInsert k v m)
   mapInsertWith f (Label k) v (LM m) = LM (mapInsertWith f k v m)
   mapDelete (Label k) (LM m) = LM (mapDelete k m)
+  mapAlter f (Label k) (LM m) = LM (mapAlter f k m)
 
   mapUnion (LM x) (LM y) = LM (mapUnion x y)
   mapUnionWithKey f (LM x) (LM y) = LM (mapUnionWithKey (f . mkHooplLabel) x y)
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index d9f140254c..70229d067d 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE BangPatterns, CPP, GADTs #-}
+{-# LANGUAGE BangPatterns, GADTs #-}
 
 module MkGraph
   ( CmmAGraph, CmmAGraphScoped, CgStmt(..)
@@ -21,7 +21,7 @@ module MkGraph
   )
 where
 
-import GhcPrelude (($),Int,Bool,Eq(..)) -- avoid importing (<*>)
+import GhcPrelude hiding ( (<*>) ) -- avoid importing (<*>)
 
 import BlockId
 import Cmm
@@ -37,10 +37,7 @@ import ForeignCall
 import OrdList
 import SMRep (ByteOff)
 import UniqSupply
-
-import Control.Monad
-import Data.List
-import Data.Maybe
+import Util
 
 
 -----------------------------------------------------------------------------
@@ -184,12 +181,10 @@ mkNop        :: CmmAGraph
 mkNop         = nilOL
 
 mkComment    :: FastString -> CmmAGraph
-#if defined(DEBUG)
--- SDM: generating all those comments takes time, this saved about 4% for me
-mkComment fs  = mkMiddle $ CmmComment fs
-#else
-mkComment _   = nilOL
-#endif
+mkComment fs
+  -- SDM: generating all those comments takes time, this saved about 4% for me
+  | debugIsOn = mkMiddle $ CmmComment fs
+  | otherwise = nilOL
 
 ---------- Assignment and store
 mkAssign     :: CmmReg  -> CmmExpr -> CmmAGraph
diff --git a/compiler/cmm/PprCmm.hs b/compiler/cmm/PprCmm.hs
index 6a93ea818e..c9a6003aaf 100644
--- a/compiler/cmm/PprCmm.hs
+++ b/compiler/cmm/PprCmm.hs
@@ -141,8 +141,8 @@ pprCmmGraph g
    = text "{" <> text "offset"
   $$ nest 2 (vcat $ map ppr blocks)
   $$ text "}"
-  where blocks = postorderDfs g
-    -- postorderDfs has the side-effect of discarding unreachable code,
+  where blocks = revPostorder g
+    -- revPostorder has the side-effect of discarding unreachable code,
     -- so pretty-printed Cmm will omit any unreachable blocks.  This can
     -- sometimes be confusing.
 
diff --git a/compiler/cmm/PprCmmDecl.hs b/compiler/cmm/PprCmmDecl.hs
index 9b3cecc3b8..9dd2332b67 100644
--- a/compiler/cmm/PprCmmDecl.hs
+++ b/compiler/cmm/PprCmmDecl.hs
@@ -1,5 +1,3 @@
-{-# LANGUAGE CPP #-}
-
 ----------------------------------------------------------------------------
 --
 -- Pretty-printing of common Cmm types
@@ -54,7 +52,6 @@ import System.IO
 
 -- Temp Jan08
 import SMRep
-#include "../includes/rts/storage/FunTypes.h"
 
 
 pprCmms :: (Outputable info, Outputable g)
diff --git a/compiler/codeGen/CgUtils.hs b/compiler/codeGen/CgUtils.hs
index c20f1fd1d0..6a2840294a 100644
--- a/compiler/codeGen/CgUtils.hs
+++ b/compiler/codeGen/CgUtils.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE CPP, GADTs #-}
+{-# LANGUAGE GADTs #-}
 
 -----------------------------------------------------------------------------
 --
@@ -10,8 +10,6 @@
 
 module CgUtils ( fixStgRegisters ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude
 
 import CodeGen.Platform
diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs
index 9ef552d336..b29394da6f 100644
--- a/compiler/codeGen/StgCmmBind.hs
+++ b/compiler/codeGen/StgCmmBind.hs
@@ -1,5 +1,3 @@
-{-# LANGUAGE CPP #-}
-
 -----------------------------------------------------------------------------
 --
 -- Stg to C-- code generation: bindings
@@ -15,8 +13,6 @@ module StgCmmBind (
         pushUpdateFrame, emitUpdateFrame
   ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude hiding ((<*>))
 
 import StgCmmExpr
diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs
index 3fcc935121..22fcfaf412 100644
--- a/compiler/codeGen/StgCmmExpr.hs
+++ b/compiler/codeGen/StgCmmExpr.hs
@@ -1,5 +1,4 @@
 {-# LANGUAGE CPP #-}
-{-# OPTIONS_GHC -fno-warn-unused-do-bind #-}
 
 -----------------------------------------------------------------------------
 --
@@ -61,7 +60,8 @@ cgExpr  :: StgExpr -> FCode ReturnKind
 
 cgExpr (StgApp fun args)     = cgIdApp fun args
 
-{- seq# a s ==> a -}
+-- seq# a s ==> a
+-- See Note [seq# magic] in PrelRules
 cgExpr (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _res_ty) =
   cgIdApp a []
 
@@ -409,7 +409,8 @@ cgCase (StgApp v []) bndr alt_type@(PrimAlt _) alts
        ; v_info <- getCgIdInfo v
        ; emitAssign (CmmLocal (idToReg dflags (NonVoid bndr)))
                     (idInfoToAmode v_info)
-       ; bindArgToReg (NonVoid bndr)
+       -- Add bndr to the environment
+       ; _ <- bindArgToReg (NonVoid bndr)
        ; cgAlts (NoGcInAlts,AssignedDirectly) (NonVoid bndr) alt_type alts }
   where
     reps_compatible = ((==) `on` (primRepSlot . idPrimRep)) v bndr
@@ -435,7 +436,8 @@ it would be better to invoke some kind of panic function here.
 cgCase scrut@(StgApp v []) _ (PrimAlt _) _
   = do { dflags <- getDynFlags
        ; mb_cc <- maybeSaveCostCentre True
-       ; withSequel (AssignTo [idToReg dflags (NonVoid v)] False) (cgExpr scrut)
+       ; _ <- withSequel
+                  (AssignTo [idToReg dflags (NonVoid v)] False) (cgExpr scrut)
        ; restoreCurrentCostCentre mb_cc
        ; emitComment $ mkFastString "should be unreachable code"
        ; l <- newBlockId
@@ -446,13 +448,14 @@ cgCase scrut@(StgApp v []) _ (PrimAlt _) _
 
 {- Note [Handle seq#]
 ~~~~~~~~~~~~~~~~~~~~~
-case seq# a s of v
-  (# s', a' #) -> e
+See Note [seq# magic] in PrelRules.
+The special case for seq# in cgCase does this:
 
+  case seq# a s of v
+    (# s', a' #) -> e
 ==>
-
-case a of v
-  (# s', a' #) -> e
+  case a of v
+    (# s', a' #) -> e
 
 (taking advantage of the fact that the return convention for (# State#, a #)
 is the same as the return convention for just 'a')
@@ -460,6 +463,7 @@ is the same as the return convention for just 'a')
 
 cgCase (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _) bndr alt_type alts
   = -- Note [Handle seq#]
+    -- And see Note [seq# magic] in PrelRules
     -- Use the same return convention as vanilla 'a'.
     cgCase (StgApp a []) bndr alt_type alts
 
diff --git a/compiler/codeGen/StgCmmForeign.hs b/compiler/codeGen/StgCmmForeign.hs
index d0ad17f59b..c1103e7d77 100644
--- a/compiler/codeGen/StgCmmForeign.hs
+++ b/compiler/codeGen/StgCmmForeign.hs
@@ -1,5 +1,3 @@
-{-# LANGUAGE CPP #-}
-
 -----------------------------------------------------------------------------
 --
 -- Code generation for foreign calls.
@@ -20,8 +18,6 @@ module StgCmmForeign (
   emitCloseNursery,
  ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude hiding( succ, (<*>) )
 
 import StgSyn
@@ -408,8 +404,8 @@ Opening the nursery corresponds to the following code:
 @
    tso = CurrentTSO;
    cn = CurrentNursery;
-   bdfree = CurrentNuresry->free;
-   bdstart = CurrentNuresry->start;
+   bdfree = CurrentNursery->free;
+   bdstart = CurrentNursery->start;
 
    // We *add* the currently occupied portion of the nursery block to
    // the allocation limit, because we will subtract it again in
diff --git a/compiler/codeGen/StgCmmHeap.hs b/compiler/codeGen/StgCmmHeap.hs
index 07633ed4ae..3be35b35fa 100644
--- a/compiler/codeGen/StgCmmHeap.hs
+++ b/compiler/codeGen/StgCmmHeap.hs
@@ -1,5 +1,3 @@
-{-# LANGUAGE CPP #-}
-
 -----------------------------------------------------------------------------
 --
 -- Stg to C--: heap management functions
@@ -22,8 +20,6 @@ module StgCmmHeap (
         emitSetDynHdr
     ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude hiding ((<*>))
 
 import StgSyn
diff --git a/compiler/codeGen/StgCmmMonad.hs b/compiler/codeGen/StgCmmMonad.hs
index 7c3864296c..cc941a2e57 100644
--- a/compiler/codeGen/StgCmmMonad.hs
+++ b/compiler/codeGen/StgCmmMonad.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE CPP, GADTs, UnboxedTuples #-}
+{-# LANGUAGE GADTs, UnboxedTuples #-}
 
 -----------------------------------------------------------------------------
 --
@@ -58,8 +58,6 @@ module StgCmmMonad (
         CgInfoDownwards(..), CgState(..)        -- non-abstract
     ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude hiding( sequence, succ )
 
 import Cmm
@@ -79,6 +77,7 @@ import Unique
 import UniqSupply
 import FastString
 import Outputable
+import Util
 
 import Control.Monad
 import Data.List
@@ -696,11 +695,9 @@ emitLabel id = do tscope <- getTickScope
                   emitCgStmt (CgLabel id tscope)
 
 emitComment :: FastString -> FCode ()
-#if 0 /* def DEBUG */
-emitComment s = emitCgStmt (CgStmt (CmmComment s))
-#else
-emitComment _ = return ()
-#endif
+emitComment s
+  | debugIsOn = emitCgStmt (CgStmt (CmmComment s))
+  | otherwise = return ()
 
 emitTick :: CmmTickish -> FCode ()
 emitTick = emitCgStmt . CgStmt . CmmTick
diff --git a/compiler/codeGen/StgCmmProf.hs b/compiler/codeGen/StgCmmProf.hs
index a0bca5d661..15c31ca59c 100644
--- a/compiler/codeGen/StgCmmProf.hs
+++ b/compiler/codeGen/StgCmmProf.hs
@@ -1,5 +1,3 @@
-{-# LANGUAGE CPP #-}
-
 -----------------------------------------------------------------------------
 --
 -- Code generation for profiling
@@ -25,8 +23,6 @@ module StgCmmProf (
         ldvEnter, ldvEnterClosure, ldvRecordCreate
   ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude
 
 import StgCmmClosure
diff --git a/compiler/codeGen/StgCmmTicky.hs b/compiler/codeGen/StgCmmTicky.hs
index a7d158ce3a..8f3074856a 100644
--- a/compiler/codeGen/StgCmmTicky.hs
+++ b/compiler/codeGen/StgCmmTicky.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE BangPatterns, CPP #-}
+{-# LANGUAGE BangPatterns #-}
 
 -----------------------------------------------------------------------------
 --
@@ -104,8 +104,6 @@ module StgCmmTicky (
   tickySlowCall, tickySlowCallPat,
   ) where
 
-#include "HsVersions.h"
-
 import GhcPrelude
 
 import StgCmmArgRep    ( slowCallPattern , toArgRep , argRepString )
diff --git a/compiler/coreSyn/CoreUtils.hs b/compiler/coreSyn/CoreUtils.hs
index 4db9d8fc29..5608afc334 100644
--- a/compiler/coreSyn/CoreUtils.hs
+++ b/compiler/coreSyn/CoreUtils.hs
@@ -1449,8 +1449,11 @@ app_ok primop_ok fun args
               -- Often there is a literal divisor, and this
               -- can get rid of a thunk in an inner loop
 
+        | SeqOp <- op    -- See Note [seq# and expr_ok]
+        -> all (expr_ok primop_ok) args
+
         | otherwise
-        -> primop_ok op     -- Check the primop itself
+        -> primop_ok op  -- Check the primop itself
         && and (zipWith arg_ok arg_tys args)  -- Check the arguments
 
       _other -> isUnliftedType (idType fun)          -- c.f. the Var case of exprIsHNF
@@ -1607,6 +1610,25 @@ See also Note [dataToTag#] in primops.txt.pp.
 
 Bottom line:
   * in exprOkForSpeculation we simply ignore all lifted arguments.
+  * except see Note [seq# and expr_ok] for an exception
+
+
+Note [seq# and expr_ok]
+~~~~~~~~~~~~~~~~~~~~~~~
+Recall that
+   seq# :: forall a s . a -> State# s -> (# State# s, a #)
+must always evaluate its first argument.  So it's really a
+counter-example to Note [Primops with lifted arguments]. In
+the case of seq# we must check the argument to seq#.  Remember
+item (d) of the specification of exprOkForSpeculation:
+
+  -- Precisely, it returns @True@ iff:
+  --  a) The expression guarantees to terminate,
+         ...
+  --  d) without throwing a Haskell exception
+
+The lack of this special case caused Trac #5129 to go bad again.
+See comment:24 and following
 
 
 ************************************************************************
diff --git a/compiler/hsSyn/Convert.hs b/compiler/hsSyn/Convert.hs
index 531f146a9d..644075810c 100644
--- a/compiler/hsSyn/Convert.hs
+++ b/compiler/hsSyn/Convert.hs
@@ -1625,8 +1625,14 @@ thRdrName loc ctxt_ns th_occ th_name
     occ :: OccName.OccName
     occ = mk_occ ctxt_ns th_occ
 
+-- Return an unqualified exact RdrName if we're dealing with built-in syntax.
+-- See Trac #13776.
 thOrigRdrName :: String -> TH.NameSpace -> PkgName -> ModName -> RdrName
-thOrigRdrName occ th_ns pkg mod = (mkOrig $! (mkModule (mk_pkg pkg) (mk_mod mod))) $! (mk_occ (mk_ghc_ns th_ns) occ)
+thOrigRdrName occ th_ns pkg mod =
+  let occ' = mk_occ (mk_ghc_ns th_ns) occ
+  in case isBuiltInOcc_maybe occ' of
+       Just name -> nameRdrName name
+       Nothing   -> (mkOrig $! (mkModule (mk_pkg pkg) (mk_mod mod))) $! occ'
 
 thRdrNameGuesses :: TH.Name -> [RdrName]
 thRdrNameGuesses (TH.Name occ flavour)
diff --git a/compiler/iface/IfaceType.hs b/compiler/iface/IfaceType.hs
index 0c5922eb53..d0adce99ee 100644
--- a/compiler/iface/IfaceType.hs
+++ b/compiler/iface/IfaceType.hs
@@ -257,6 +257,10 @@ data IfaceCoercion
   | IfaceForAllCo     IfaceTvBndr IfaceCoercion IfaceCoercion
   | IfaceCoVarCo      IfLclName
   | IfaceAxiomInstCo  IfExtName BranchIndex [IfaceCoercion]
+  | IfaceAxiomRuleCo  IfLclName [IfaceCoercion]
+       -- There are only a fixed number of CoAxiomRules, so it suffices
+       -- to use an IfaceLclName to distinguish them.
+       -- See Note [Adding built-in type families] in TcTypeNats
   | IfaceUnivCo       IfaceUnivCoProv Role IfaceType IfaceType
   | IfaceSymCo        IfaceCoercion
   | IfaceTransCo      IfaceCoercion IfaceCoercion
@@ -266,7 +270,6 @@ data IfaceCoercion
   | IfaceCoherenceCo  IfaceCoercion IfaceCoercion
   | IfaceKindCo       IfaceCoercion
   | IfaceSubCo        IfaceCoercion
-  | IfaceAxiomRuleCo  IfLclName [IfaceCoercion]
   | IfaceFreeCoVar    CoVar    -- See Note [Free tyvars in IfaceType]
   | IfaceHoleCo       CoVar    -- ^ See Note [Holes in IfaceCoercion]
 
diff --git a/compiler/iface/TcIface.hs b/compiler/iface/TcIface.hs
index 70438f6337..ca1a17dba4 100644
--- a/compiler/iface/TcIface.hs
+++ b/compiler/iface/TcIface.hs
@@ -1359,7 +1359,7 @@ tcIfaceCo = go
                                                <*> go c2
     go (IfaceKindCo c)           = KindCo   <$> go c
     go (IfaceSubCo c)            = SubCo    <$> go c
-    go (IfaceAxiomRuleCo ax cos) = AxiomRuleCo <$> go_axiom_rule ax
+    go (IfaceAxiomRuleCo ax cos) = AxiomRuleCo <$> tcIfaceCoAxiomRule ax
                                                <*> mapM go cos
     go (IfaceFreeCoVar c)        = pprPanic "tcIfaceCo:IfaceFreeCoVar" (ppr c)
     go (IfaceHoleCo c)           = pprPanic "tcIfaceCo:IfaceHoleCo"    (ppr c)
@@ -1367,12 +1367,6 @@ tcIfaceCo = go
     go_var :: FastString -> IfL CoVar
     go_var = tcIfaceLclId
 
-    go_axiom_rule :: FastString -> IfL CoAxiomRule
-    go_axiom_rule n =
-      case Map.lookup n typeNatCoAxiomRules of
-        Just ax -> return ax
-        _  -> pprPanic "go_axiom_rule" (ppr n)
-
 tcIfaceUnivCoProv :: IfaceUnivCoProv -> IfL UnivCoProvenance
 tcIfaceUnivCoProv IfaceUnsafeCoerceProv     = return UnsafeCoerceProv
 tcIfaceUnivCoProv (IfacePhantomProv kco)    = PhantomProv <$> tcIfaceCo kco
@@ -1808,6 +1802,16 @@ tcIfaceCoAxiom :: Name -> IfL (CoAxiom Branched)
 tcIfaceCoAxiom name = do { thing <- tcIfaceImplicit name
                          ; return (tyThingCoAxiom thing) }
 
+
+tcIfaceCoAxiomRule :: IfLclName -> IfL CoAxiomRule
+-- Unlike CoAxioms, which arise form user 'type instance' declarations,
+-- there are a fixed set of CoAxiomRules,
+-- currently enumerated in typeNatCoAxiomRules
+tcIfaceCoAxiomRule n
+  = case Map.lookup n typeNatCoAxiomRules of
+        Just ax -> return ax
+        _  -> pprPanic "tcIfaceCoAxiomRule" (ppr n)
+
 tcIfaceDataCon :: Name -> IfL DataCon
 tcIfaceDataCon name = do { thing <- tcIfaceGlobal name
                          ; case thing of
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 7b9cb13254..0d018a7ec4 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -59,6 +59,7 @@ module DynFlags (
         tablesNextToCode, mkTablesNextToCode,
         makeDynFlagsConsistent,
         shouldUseColor,
+        shouldUseHexWordLiterals,
         positionIndependent,
         optimisationFlags,
 
@@ -449,6 +450,7 @@ data GeneralFlag
    | Opt_KillOneShot
    | Opt_FullLaziness
    | Opt_FloatIn
+   | Opt_LateSpecialise
    | Opt_Specialise
    | Opt_SpecialiseAggressively
    | Opt_CrossModuleSpecialise
@@ -566,6 +568,7 @@ data GeneralFlag
    | Opt_NoSortValidSubstitutions
    | Opt_AbstractRefSubstitutions
    | Opt_ShowLoadedModules
+   | Opt_HexWordLiterals -- See Note [Print Hexadecimal Literals]
 
    -- Suppress all coercions, them replacing with '...'
    | Opt_SuppressCoercions
@@ -631,6 +634,7 @@ optimisationFlags = EnumSet.fromList
    , Opt_KillOneShot
    , Opt_FullLaziness
    , Opt_FloatIn
+   , Opt_LateSpecialise
    , Opt_Specialise
    , Opt_SpecialiseAggressively
    , Opt_CrossModuleSpecialise
@@ -1482,6 +1486,10 @@ data RtsOptsEnabled
 shouldUseColor :: DynFlags -> Bool
 shouldUseColor dflags = overrideWith (canUseColor dflags) (useColor dflags)
 
+shouldUseHexWordLiterals :: DynFlags -> Bool
+shouldUseHexWordLiterals dflags =
+  Opt_HexWordLiterals `EnumSet.member` generalFlags dflags
+
 -- | Are we building with @-fPIE@ or @-fPIC@ enabled?
 positionIndependent :: DynFlags -> Bool
 positionIndependent dflags = gopt Opt_PIC dflags || gopt Opt_PIE dflags
@@ -3007,6 +3015,8 @@ dynamic_flags_deps = [
         (NoArg (setRtsOptsEnabled RtsOptsNone))
   , make_ord_flag defGhcFlag "no-rtsopts-suggestions"
       (noArg (\d -> d {rtsOptsSuggestions = False}))
+  , make_ord_flag defGhcFlag "dhex-word-literals"
+        (NoArg (setGeneralFlag Opt_HexWordLiterals))
 
   , make_ord_flag defGhcFlag "ghcversion-file"      (hasArg addGhcVersionFile)
   , make_ord_flag defGhcFlag "main-is"              (SepArg setMainIs)
@@ -3929,6 +3939,7 @@ fFlagsDeps = [
   flagSpec "kill-absence"                     Opt_KillAbsence,
   flagSpec "kill-one-shot"                    Opt_KillOneShot,
   flagSpec "late-dmd-anal"                    Opt_LateDmdAnal,
+  flagSpec "late-specialise"                  Opt_LateSpecialise,
   flagSpec "liberate-case"                    Opt_LiberateCase,
   flagSpec "llvm-pass-vectors-in-regs"        Opt_LlvmPassVectorsInRegisters,
   flagHiddenSpec "llvm-tbaa"                  Opt_LlvmTBAA,
diff --git a/compiler/main/DynFlags.hs-boot b/compiler/main/DynFlags.hs-boot
index a8efb6013d..7440e5db00 100644
--- a/compiler/main/DynFlags.hs-boot
+++ b/compiler/main/DynFlags.hs-boot
@@ -5,13 +5,15 @@ import Platform
 
 data DynFlags
 data DumpFlag
+data GeneralFlag
 
-targetPlatform       :: DynFlags -> Platform
-pprUserLength        :: DynFlags -> Int
-pprCols              :: DynFlags -> Int
-unsafeGlobalDynFlags :: DynFlags
-useUnicode           :: DynFlags -> Bool
-useUnicodeSyntax     :: DynFlags -> Bool
-shouldUseColor       :: DynFlags -> Bool
-hasPprDebug          :: DynFlags -> Bool
-hasNoDebugOutput     :: DynFlags -> Bool
+targetPlatform           :: DynFlags -> Platform
+pprUserLength            :: DynFlags -> Int
+pprCols                  :: DynFlags -> Int
+unsafeGlobalDynFlags     :: DynFlags
+useUnicode               :: DynFlags -> Bool
+useUnicodeSyntax         :: DynFlags -> Bool
+shouldUseColor           :: DynFlags -> Bool
+shouldUseHexWordLiterals :: DynFlags -> Bool
+hasPprDebug              :: DynFlags -> Bool
+hasNoDebugOutput         :: DynFlags -> Bool
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index e2c568c836..7c345f2328 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -382,6 +382,14 @@ iselExpr64 (CmmMachOp (MO_UU_Conv W32 W64) [expr]) = do
         mov_lo = MR rlo expr_reg
     return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi)
                          rlo
+
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+    (expr_reg,expr_code) <- getSomeReg expr
+    (rlo, rhi) <- getNewRegPairNat II32
+    let mov_hi = SRA II32 rhi expr_reg (RIImm (ImmInt 31))
+        mov_lo = MR rlo expr_reg
+    return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi)
+                         rlo
 iselExpr64 expr
    = pprPanic "iselExpr64(powerpc)" (pprExpr expr)
 
diff --git a/compiler/nativeGen/SPARC/CodeGen/Gen64.hs b/compiler/nativeGen/SPARC/CodeGen/Gen64.hs
index 8549fa07ee..6fa7482f9b 100644
--- a/compiler/nativeGen/SPARC/CodeGen/Gen64.hs
+++ b/compiler/nativeGen/SPARC/CodeGen/Gen64.hs
@@ -193,6 +193,24 @@ iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr])
 
         return  $ ChildCode64 code r_dst_lo
 
+-- only W32 supported for now
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr])
+ = do
+        r_dst_lo        <- getNewRegNat II32
+        let r_dst_hi    = getHiVRegFromLo r_dst_lo
+
+        -- compute expr and load it into r_dst_lo
+        (a_reg, a_code) <- getSomeReg expr
+
+        dflags          <- getDynFlags
+        let platform    = targetPlatform dflags
+            code        = a_code
+                `appOL` toOL
+                        [ SRA a_reg (RIImm (ImmInt 31)) r_dst_hi
+                        , mkRegRegMoveInstr platform a_reg r_dst_lo ]
+
+        return  $ ChildCode64 code r_dst_lo
+
 
 iselExpr64 expr
    = pprPanic "iselExpr64(sparc)" (ppr expr)
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 09757e769e..a0b0673d27 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -468,6 +468,20 @@ iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
                           r_dst_lo
             )
 
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+     fn <- getAnyReg expr
+     r_dst_lo <-  getNewRegNat II32
+     let r_dst_hi = getHiVRegFromLo r_dst_lo
+         code = fn r_dst_lo
+     return (
+             ChildCode64 (code `snocOL`
+                          MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
+                          CLTD II32 `snocOL`
+                          MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
+                          MOV II32 (OpReg edx) (OpReg r_dst_hi))
+                          r_dst_lo
+            )
+
 iselExpr64 expr
    = pprPanic "iselExpr64(i386)" (ppr expr)
 
diff --git a/compiler/prelude/PrelRules.hs b/compiler/prelude/PrelRules.hs
index 73484b7c35..14e3f0f36e 100644
--- a/compiler/prelude/PrelRules.hs
+++ b/compiler/prelude/PrelRules.hs
@@ -942,7 +942,56 @@ dataToTagRule = a `mplus` b
 ************************************************************************
 -}
 
--- seq# :: forall a s . a -> State# s -> (# State# s, a #)
+{- Note [seq# magic]
+~~~~~~~~~~~~~~~~~~~~
+The primop
+   seq# :: forall a s . a -> State# s -> (# State# s, a #)
+
+is /not/ the same as the Prelude function seq :: a -> b -> b
+as you can see from its type.  In fact, seq# is the implementation
+mechanism for 'evaluate'
+
+   evaluate :: a -> IO a
+   evaluate a = IO $ \s -> seq# a s
+
+The semantics of seq# is
+  * evaluate its first argument
+  * and return it
+
+Things to note
+
+* Why do we need a primop at all?  That is, instead of
+      case seq# x s of (# x, s #) -> blah
+  why not instead say this?
+      case x of { DEFAULT -> blah)
+
+  Reason (see Trac #5129): if we saw
+    catch# (\s -> case x of { DEFAULT -> raiseIO# exn s }) handler
+
+  then we'd drop the 'case x' because the body of the case is bottom
+  anyway. But we don't want to do that; the whole /point/ of
+  seq#/evaluate is to evaluate 'x' first in the IO monad.
+
+  In short, we /always/ evaluate the first argument and never
+  just discard it.
+
+* Why return the value?  So that we can control sharing of seq'd
+  values: in
+     let x = e in x `seq` ... x ...
+  We don't want to inline x, so better to represent it as
+       let x = e in case seq# x RW of (# _, x' #) -> ... x' ...
+  also it matches the type of rseq in the Eval monad.
+
+Implementing seq#.  The compiler has magic for SeqOp in
+
+- PrelRules.seqRule: eliminate (seq# <whnf> s)
+
+- StgCmmExpr.cgExpr, and cgCase: special case for seq#
+
+- CoreUtils.exprOkForSpeculation;
+  see Note [seq# and expr_ok] in CoreUtils
+-}
+
 seqRule :: RuleM CoreExpr
 seqRule = do
   [Type ty_a, Type _ty_s, a, s] <- getArgs
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 038d350a76..996e0bb3e8 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -2697,13 +2697,7 @@ primop SparkOp "spark#" GenPrimOp
 
 primop SeqOp "seq#" GenPrimOp
    a -> State# s -> (# State# s, a #)
-
-   -- why return the value?  So that we can control sharing of seq'd
-   -- values: in
-   --    let x = e in x `seq` ... x ...
-   -- we don't want to inline x, so better to represent it as
-   --    let x = e in case seq# x RW of (# _, x' #) -> ... x' ...
-   -- also it matches the type of rseq in the Eval monad.
+   -- See Note [seq# magic] in PrelRules
 
 primop GetSparkOp "getSpark#" GenPrimOp
    State# s -> (# State# s, Int#, a #)
@@ -2942,6 +2936,20 @@ primop  TraceMarkerOp "traceMarker#" GenPrimOp
    has_side_effects = True
    out_of_line      = True
 
+primop  GetThreadAllocationCounter "getThreadAllocationCounter#" GenPrimOp
+   State# RealWorld -> (# State# RealWorld, INT64 #)
+   { Retrieves the allocation counter for the current thread. }
+   with
+   has_side_effects = True
+   out_of_line      = True
+
+primop  SetThreadAllocationCounter "setThreadAllocationCounter#" GenPrimOp
+   INT64 -> State# RealWorld -> State# RealWorld
+   { Sets the allocation counter for the current thread to the given value. }
+   with
+   has_side_effects = True
+   out_of_line      = True
+
 ------------------------------------------------------------------------
 section "Safe coercions"
 ------------------------------------------------------------------------
diff --git a/compiler/rename/RnNames.hs b/compiler/rename/RnNames.hs
index 769b34e45b..af00056271 100644
--- a/compiler/rename/RnNames.hs
+++ b/compiler/rename/RnNames.hs
@@ -1401,9 +1401,12 @@ warnUnusedImport flag fld_env (L loc decl, used, unused)
     pp_mod      = ppr (unLoc (ideclName decl))
     pp_not_used = text "is redundant"
 
+    -- In warning message, pretty-print identifiers unqualified unconditionally
+    -- to improve the consistent for ambiguous/unambiguous identifiers.
+    -- See trac#14881.
     ppr_possible_field n = case lookupNameEnv fld_env n of
-                               Just (fld, p) -> ppr p <> parens (ppr fld)
-                               Nothing  -> ppr n
+                               Just (fld, p) -> pprNameUnqualified p <> parens (ppr fld)
+                               Nothing  -> pprNameUnqualified n
 
     -- Print unused names in a deterministic (lexicographic) order
     sort_unused = pprWithCommas ppr_possible_field $
diff --git a/compiler/simplCore/SimplCore.hs b/compiler/simplCore/SimplCore.hs
index 41f0a9a495..2bea6dd05d 100644
--- a/compiler/simplCore/SimplCore.hs
+++ b/compiler/simplCore/SimplCore.hs
@@ -17,7 +17,8 @@ import CoreSyn
 import HscTypes
 import CSE              ( cseProgram )
 import Rules            ( mkRuleBase, unionRuleBase,
-                          extendRuleBaseList, ruleCheckProgram, addRuleInfo, )
+                          extendRuleBaseList, ruleCheckProgram, addRuleInfo,
+                          getRules )
 import PprCore          ( pprCoreBindings, pprCoreExpr )
 import OccurAnal        ( occurAnalysePgm, occurAnalyseExpr )
 import IdInfo
@@ -130,6 +131,7 @@ getCoreToDo dflags
     spec_constr   = gopt Opt_SpecConstr                   dflags
     liberate_case = gopt Opt_LiberateCase                 dflags
     late_dmd_anal = gopt Opt_LateDmdAnal                  dflags
+    late_specialise = gopt Opt_LateSpecialise             dflags
     static_args   = gopt Opt_StaticArgumentTransformation dflags
     rules_on      = gopt Opt_EnableRewriteRules           dflags
     eta_expand_on = gopt Opt_DoLambdaEtaExpansion         dflags
@@ -350,6 +352,10 @@ getCoreToDo dflags
 
         maybe_rule_check (Phase 0),
 
+        runWhen late_specialise
+          (CoreDoPasses [ CoreDoSpecialising
+                        , simpl_phase 0 ["post-late-spec"] max_iter]),
+
         -- Final clean-up simplification:
         simpl_phase 0 ["final"] max_iter,
 
@@ -520,10 +526,12 @@ ruleCheckPass current_phase pat guts =
     { rb <- getRuleBase
     ; dflags <- getDynFlags
     ; vis_orphs <- getVisibleOrphanMods
+    ; let rule_fn fn = getRules (RuleEnv rb vis_orphs) fn
+                        ++ (mg_rules guts)
     ; liftIO $ putLogMsg dflags NoReason Err.SevDump noSrcSpan
                    (defaultDumpStyle dflags)
                    (ruleCheckProgram current_phase pat
-                      (RuleEnv rb vis_orphs) (mg_binds guts))
+                      rule_fn (mg_binds guts))
     ; return guts }
 
 doPassDUM :: (DynFlags -> UniqSupply -> CoreProgram -> IO CoreProgram) -> ModGuts -> CoreM ModGuts
diff --git a/compiler/specialise/Rules.hs b/compiler/specialise/Rules.hs
index 319404ef15..b6025955ac 100644
--- a/compiler/specialise/Rules.hs
+++ b/compiler/specialise/Rules.hs
@@ -1148,10 +1148,10 @@ is so important.
 -- string for the purposes of error reporting
 ruleCheckProgram :: CompilerPhase               -- ^ Rule activation test
                  -> String                      -- ^ Rule pattern
-                 -> RuleEnv                     -- ^ Database of rules
+                 -> (Id -> [CoreRule])          -- ^ Rules for an Id
                  -> CoreProgram                 -- ^ Bindings to check in
                  -> SDoc                        -- ^ Resulting check message
-ruleCheckProgram phase rule_pat rule_base binds
+ruleCheckProgram phase rule_pat rules binds
   | isEmptyBag results
   = text "Rule check results: no rule application sites"
   | otherwise
@@ -1164,7 +1164,7 @@ ruleCheckProgram phase rule_pat rule_base binds
                        , rc_id_unf    = idUnfolding     -- Not quite right
                                                         -- Should use activeUnfolding
                        , rc_pattern   = rule_pat
-                       , rc_rule_base = rule_base }
+                       , rc_rules = rules }
     results = unionManyBags (map (ruleCheckBind env) binds)
     line = text (replicate 20 '-')
 
@@ -1172,7 +1172,7 @@ data RuleCheckEnv = RuleCheckEnv {
     rc_is_active :: Activation -> Bool,
     rc_id_unf  :: IdUnfoldingFun,
     rc_pattern :: String,
-    rc_rule_base :: RuleEnv
+    rc_rules :: Id -> [CoreRule]
 }
 
 ruleCheckBind :: RuleCheckEnv -> CoreBind -> Bag SDoc
@@ -1206,7 +1206,7 @@ ruleCheckFun env fn args
   | null name_match_rules = emptyBag
   | otherwise             = unitBag (ruleAppCheck_help env fn args name_match_rules)
   where
-    name_match_rules = filter match (getRules (rc_rule_base env) fn)
+    name_match_rules = filter match (rc_rules env fn)
     match rule = (rc_pattern env) `isPrefixOf` unpackFS (ruleName rule)
 
 ruleAppCheck_help :: RuleCheckEnv -> Id -> [CoreExpr] -> [CoreRule] -> SDoc
diff --git a/compiler/specialise/SpecConstr.hs b/compiler/specialise/SpecConstr.hs
index efd56ce77c..f62f7d0778 100644
--- a/compiler/specialise/SpecConstr.hs
+++ b/compiler/specialise/SpecConstr.hs
@@ -57,6 +57,7 @@ import UniqFM
 import MonadUtils
 import Control.Monad    ( zipWithM )
 import Data.List
+import Data.Maybe       ( fromMaybe )
 import PrelNames        ( specTyConName )
 import Module
 import TyCon ( TyCon )
@@ -1509,6 +1510,7 @@ data OneSpec =
   OS { os_pat  :: CallPat    -- Call pattern that generated this specialisation
      , os_rule :: CoreRule   -- Rule connecting original id with the specialisation
      , os_id   :: OutId      -- Spec id
+     , os_orig_id :: OutId   -- The original id
      , os_rhs  :: OutExpr }  -- Spec rhs
 
 noSpecInfo :: SpecInfo
@@ -1522,7 +1524,8 @@ specNonRec :: ScEnv
                                                --     plus details of specialisations
 
 specNonRec env body_usg rhs_info
-  = specialise env (scu_calls body_usg) rhs_info
+  = addPatUsages env (scu_calls body_usg) <$>
+    specialise env (scu_calls body_usg) rhs_info
                (noSpecInfo { si_mb_unspec = Just (ri_rhs_usg rhs_info) })
 
 ----------------------
@@ -1533,7 +1536,8 @@ specRec :: TopLevelFlag -> ScEnv
                                            --     plus details of specialisations
 
 specRec top_lvl env body_usg rhs_infos
-  = go 1 seed_calls nullUsage init_spec_infos
+  = addPatUsagess env (scu_calls body_usg) <$>
+    go 1 seed_calls nullUsage init_spec_infos
   where
     (seed_calls, init_spec_infos)    -- Note [Seeding top-level recursive groups]
        | isTopLevel top_lvl
@@ -1754,8 +1758,64 @@ spec_one env fn arg_bndrs body (call_pat@(qvars, pats), rule_number)
                            -- See Note [Transfer activation]
         ; return (spec_usg, OS { os_pat = call_pat, os_rule = rule
                                , os_id = spec_id
+                               , os_orig_id = fn
                                , os_rhs = spec_rhs }) }
 
+-- See Note [ArgOcc from calls to specialized functions]
+addPatUsagess :: ScEnv -> CallEnv -> (ScUsage, [SpecInfo]) -> (ScUsage, [SpecInfo])
+addPatUsagess env body_calls (usg, spec_infos) = (usg `combineUsage` extra_usages, spec_infos)
+  where extra_usages = combineUsages [ extraPatUsages env body_calls si | si <- spec_infos ]
+
+addPatUsages :: ScEnv -> CallEnv -> (ScUsage, SpecInfo) -> (ScUsage, SpecInfo)
+addPatUsages env body_calls (usg, spec_info) = (usg `combineUsage` extra_usage, spec_info)
+  where extra_usage = extraPatUsages env body_calls spec_info
+
+extraPatUsages :: ScEnv -> CallEnv -> SpecInfo -> ScUsage
+extraPatUsages env body_calls si  = combineUsages
+    [ patToCallUsage env call_pat call
+    | os <- si_specs si
+    , let fn  = os_orig_id os
+          call_pat = os_pat os
+    , pprTrace "add_pat_usages" (ppr fn <+> ppr call_pat)  True
+    , call <- fromMaybe [] $ lookupVarEnv body_calls fn
+    ]
+
+patToCallUsage :: ScEnv -> CallPat -> Call -> ScUsage
+patToCallUsage env (_qvars, pats) (Call _ args _)
+    = pprTrace "patToCallUsage" (ppr pats <+> ppr args <+> ppr usage) $
+      usage
+  where
+    usage = combineUsages $ zipWith go pats args
+
+    go :: CoreExpr -> CoreExpr -> ScUsage
+    -- The interesting case
+    go pat (Var v)
+        | Just RecArg <- lookupHowBound env v
+        , arg_occ@ScrutOcc{} <- patToArgOcc pat -- skip if we get UnkOcc
+        = nullUsage { scu_occs = unitVarEnv v arg_occ }
+
+    -- Transparent cases
+    go (Tick _ p) e = go p e
+    go (Cast p _) e = go p e
+    go p (Tick _ e) = go p e
+    go p (Cast e _) = go p e
+
+
+    -- Traverse the tree
+    go (App pf pa) (App f a)
+        = go pf f `combineUsage` go pa a
+
+    -- Boring catch-all
+    go _ _ = nullUsage
+
+patToArgOcc :: CoreExpr -> ArgOcc
+patToArgOcc e@App{}
+    | (Var f, args) <- collectArgs e
+    , Just dc <- isDataConWorkId_maybe f
+    = let arg_occs = [ patToArgOcc arg | arg <- args, not (isTypeArg arg) ]
+      in ScrutOcc $ unitUFM dc arg_occs
+patToArgOcc _
+    = UnkOcc
 
 -- See Note [Strictness information in worker binders]
 handOutStrictnessInformation :: [Demand] -> [Var] -> [Var]
@@ -1792,6 +1852,42 @@ calcSpecStrictness fn qvars pats
     go_one env _         _ = env
 
 {-
+Note [ArgOcc from calls to specialized functions]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We collect the ArgOcc to find out which parameters are being scrutinized in the
+body function, and only generate specializations when they would lead to some
+optimization: In
+
+  foo x = … case x of (a,b) -> …
+
+We are willing to specialize foo. If we have
+
+  foo x = … bar x …
+    where bar y = …
+
+we normally don’t. But what if we specialize bar? Then we have
+
+  foo x = … bar x …
+    where $sbar a b = …
+          bar y = …
+          {-# RULE forall a b. bar (a,b) = $sbar a b #-}
+
+and now it would be beneficial to create a specialized version of foo that
+calls $sbar directly.
+
+To achieve this, after we specialize bar, we look at the calls to it (found in
+scu_calls), and all the specializations that we created. If there is a call `bar x`
+and a specialization pattern `(x,y)`, then we treat that as if we found a case
+analysis of x, and include `x ↦ ScrutOcc` in scu_occs. This unblocks specialization
+of foo, and so on.
+
+(We might want to generalize this to any call to `baz x` where `baz` has
+rewrite rules that match on constructor arguments, not only for when _we_ _just_
+created specializations.)
+
+(See #14951)
+
 Note [spec_usg includes rhs_usg]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 In calls to 'specialise', the returned ScUsage must include the rhs_usg in
diff --git a/compiler/typecheck/TcInstDcls.hs b/compiler/typecheck/TcInstDcls.hs
index 9f623fc0a5..7e44c4a32e 100644
--- a/compiler/typecheck/TcInstDcls.hs
+++ b/compiler/typecheck/TcInstDcls.hs
@@ -41,6 +41,7 @@ import TcUnify
 import CoreSyn    ( Expr(..), mkApps, mkVarApps, mkLams )
 import MkCore     ( nO_METHOD_BINDING_ERROR_ID )
 import CoreUnfold ( mkInlineUnfoldingWithArity, mkDFunUnfolding )
+import Kind
 import Type
 import TcEvidence
 import TyCon
@@ -680,7 +681,7 @@ tcDataFamInstDecl mb_clsinfo
          -- Deal with any kind signature.
          -- See also Note [Arity of data families] in FamInstEnv
        ; (extra_tcbs, final_res_kind) <- tcDataKindSig full_tcbs res_kind'
-       ; checkTc (isLiftedTypeKind final_res_kind) (badKindSig True res_kind')
+       ; checkTc (tcIsStarKind final_res_kind) (badKindSig True res_kind')
 
        ; let extra_pats  = map (mkTyVarTy . binderVar) extra_tcbs
              all_pats    = pats' `chkAppend` extra_pats
@@ -722,7 +723,7 @@ tcDataFamInstDecl mb_clsinfo
        ; checkValidFamPats mb_clsinfo fam_tc tvs' [] pats' extra_pats pp_hs_pats
 
          -- Result kind must be '*' (otherwise, we have too few patterns)
-       ; checkTc (isLiftedTypeKind final_res_kind) $
+       ; checkTc (tcIsStarKind final_res_kind) $
          tooFewParmsErr (tyConArity fam_tc)
 
        ; checkValidTyCon rep_tc
diff --git a/compiler/typecheck/TcTyClsDecls.hs b/compiler/typecheck/TcTyClsDecls.hs
index 7436b0d690..39697d6d94 100644
--- a/compiler/typecheck/TcTyClsDecls.hs
+++ b/compiler/typecheck/TcTyClsDecls.hs
@@ -886,7 +886,7 @@ tcFamDecl1 parent (FamilyDecl { fdInfo = fam_info, fdLName = tc_lname@(L _ tc_na
   -- Data families might have a variable return kind.
   -- See See Note [Arity of data families] in FamInstEnv.
   ; (extra_binders, final_res_kind) <- tcDataKindSig binders res_kind
-  ; checkTc (isLiftedTypeKind final_res_kind
+  ; checkTc (tcIsStarKind final_res_kind
              || isJust (tcGetCastedTyVar_maybe final_res_kind))
             (badKindSig False res_kind)
 
@@ -1038,7 +1038,7 @@ tcDataDefn roles_info
        ; let hsc_src = tcg_src tcg_env
        ; (extra_bndrs, final_res_kind) <- tcDataKindSig tycon_binders res_kind
        ; unless (mk_permissive_kind hsc_src cons) $
-         checkTc (isLiftedTypeKind final_res_kind) (badKindSig True res_kind)
+         checkTc (tcIsStarKind final_res_kind) (badKindSig True res_kind)
 
        ; let final_bndrs  = tycon_binders `chkAppend` extra_bndrs
              roles        = roles_info tc_name
diff --git a/compiler/typecheck/TcTypeNats.hs b/compiler/typecheck/TcTypeNats.hs
index 78e0b96e11..24e12cd15c 100644
--- a/compiler/typecheck/TcTypeNats.hs
+++ b/compiler/typecheck/TcTypeNats.hs
@@ -5,6 +5,9 @@ module TcTypeNats
   , typeNatCoAxiomRules
   , BuiltInSynFamily(..)
 
+    -- If you define a new built-in type family, make sure to export its TyCon
+    -- from here as well.
+    -- See Note [Adding built-in type families]
   , typeNatAddTyCon
   , typeNatMulTyCon
   , typeNatExpTyCon
@@ -53,10 +56,86 @@ import Data.Maybe ( isJust )
 import Control.Monad ( guard )
 import Data.List  ( isPrefixOf, isSuffixOf )
 
+{-
+Note [Type-level literals]
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+There are currently two forms of type-level literals: natural numbers, and
+symbols (even though this module is named TcTypeNats, it covers both).
+
+Type-level literals are supported by CoAxiomRules (conditional axioms), which
+power the built-in type families (see Note [Adding built-in type families]).
+Currently, all built-in type families are for the express purpose of supporting
+type-level literals.
+
+See also the Wiki page:
+
+    https://ghc.haskell.org/trac/ghc/wiki/TypeNats
+
+Note [Adding built-in type families]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There are a few steps to adding a built-in type family:
+
+* Adding a unique for the type family TyCon
+
+  These go in PrelNames. It will likely be of the form
+  @myTyFamNameKey = mkPreludeTyConUnique xyz@, where @xyz@ is a number that
+  has not been chosen before in PrelNames. There are several examples already
+  in PrelNames—see, for instance, typeNatAddTyFamNameKey.
+
+* Adding the type family TyCon itself
+
+  This goes in TcTypeNats. There are plenty of examples of how to define
+  these—see, for instance, typeNatAddTyCon.
+
+  Once your TyCon has been defined, be sure to:
+
+  - Export it from TcTypeNats. (Not doing so caused #14632.)
+  - Include it in the typeNatTyCons list, defined in TcTypeNats.
+
+* Exposing associated type family axioms
+
+  When defining the type family TyCon, you will need to define an axiom for
+  the type family in general (see, for instance, axAddDef), and perhaps other
+  auxiliary axioms for special cases of the type family (see, for instance,
+  axAdd0L and axAdd0R).
+
+  After you have defined all of these axioms, be sure to include them in the
+  typeNatCoAxiomRules list, defined in TcTypeNats.
+  (Not doing so caused #14934.)
+
+* Define the type family somewhere
+
+  Finally, you will need to define the type family somewhere, likely in @base@.
+  Currently, all of the built-in type families are defined in GHC.TypeLits or
+  GHC.TypeNats, so those are likely candidates.
+
+  Since the behavior of your built-in type family is specified in TcTypeNats,
+  you should give an open type family definition with no instances, like so:
+
+    type family MyTypeFam (m :: Nat) (n :: Nat) :: Nat
+
+  Changing the argument and result kinds as appropriate.
+
+* Update the relevant test cases
+
+  The GHC test suite will likely need to be updated after you add your built-in
+  type family. For instance:
+
+  - The T9181 test prints the :browse contents of GHC.TypeLits, so if you added
+    a test there, the expected output of T9181 will need to change.
+  - The TcTypeNatSimple and TcTypeSymbolSimple tests have compile-time unit
+    tests, as well as TcTypeNatSimpleRun and TcTypeSymbolSimpleRun, which have
+    runtime unit tests. Consider adding further unit tests to those if your
+    built-in type family deals with Nats or Symbols, respectively.
+-}
+
 {-------------------------------------------------------------------------------
 Built-in type constructors for functions on type-level nats
 -}
 
+-- The list of built-in type family TyCons that GHC uses.
+-- If you define a built-in type family, make sure to add it to this list.
+-- See Note [Adding built-in type families]
 typeNatTyCons :: [TyCon]
 typeNatTyCons =
   [ typeNatAddTyCon
@@ -266,6 +345,7 @@ Built-in rules axioms
 
 -- If you add additional rules, please remember to add them to
 -- `typeNatCoAxiomRules` also.
+-- See Note [Adding built-in type families]
 axAddDef
   , axMulDef
   , axExpDef
@@ -375,6 +455,9 @@ axAppendSymbol0R  = mkAxiom1 "Concat0R"
 axAppendSymbol0L  = mkAxiom1 "Concat0L"
             $ \(Pair s t) -> (s `appendSymbol` mkStrLitTy nilFS) === t
 
+-- The list of built-in type family axioms that GHC uses.
+-- If you define new axioms, make sure to include them in this list.
+-- See Note [Adding built-in type families]
 typeNatCoAxiomRules :: Map.Map FastString CoAxiomRule
 typeNatCoAxiomRules = Map.fromList $ map (\x -> (coaxrName x, x))
   [ axAddDef
@@ -398,6 +481,7 @@ typeNatCoAxiomRules = Map.fromList $ map (\x -> (coaxrName x, x))
   , axCmpSymbolRefl
   , axLeq0L
   , axSubDef
+  , axSub0R
   , axAppendSymbol0R
   , axAppendSymbol0L
   , axDivDef
diff --git a/compiler/types/TyCoRep.hs b/compiler/types/TyCoRep.hs
index 588963d012..cc425991ae 100644
--- a/compiler/types/TyCoRep.hs
+++ b/compiler/types/TyCoRep.hs
@@ -814,16 +814,17 @@ data Coercion
      -- any left over, we use AppCo.
      -- See [Coercion axioms applied to coercions]
 
+  | AxiomRuleCo CoAxiomRule [Coercion]
+    -- AxiomRuleCo is very like AxiomInstCo, but for a CoAxiomRule
+    -- The number coercions should match exactly the expectations
+    -- of the CoAxiomRule (i.e., the rule is fully saturated).
+
   | UnivCo UnivCoProvenance Role Type Type
       -- :: _ -> "e" -> _ -> _ -> e
 
   | SymCo Coercion             -- :: e -> e
   | TransCo Coercion Coercion  -- :: e -> e -> e
 
-    -- The number coercions should match exactly the expectations
-    -- of the CoAxiomRule (i.e., the rule is fully saturated).
-  | AxiomRuleCo CoAxiomRule [Coercion]
-
   | NthCo  Int         Coercion     -- Zero-indexed; decomposes (T t0 ... tn)
     -- :: _ -> e -> ?? (inverse of TyConAppCo, see Note [TyConAppCo roles])
     -- Using NthCo on a ForAllCo gives an N coercion always
diff --git a/compiler/utils/Outputable.hs b/compiler/utils/Outputable.hs
index 793b8fb139..2b03555bab 100644
--- a/compiler/utils/Outputable.hs
+++ b/compiler/utils/Outputable.hs
@@ -22,7 +22,7 @@ module Outputable (
         empty, isEmpty, nest,
         char,
         text, ftext, ptext, ztext,
-        int, intWithCommas, integer, float, double, rational, doublePrec,
+        int, intWithCommas, integer, word, float, double, rational, doublePrec,
         parens, cparen, brackets, braces, quotes, quote,
         doubleQuotes, angleBrackets, paBrackets,
         semi, comma, colon, dcolon, space, equals, dot, vbar,
@@ -91,7 +91,8 @@ import GhcPrelude
 import {-# SOURCE #-}   DynFlags( DynFlags, hasPprDebug, hasNoDebugOutput,
                                   targetPlatform, pprUserLength, pprCols,
                                   useUnicode, useUnicodeSyntax,
-                                  shouldUseColor, unsafeGlobalDynFlags )
+                                  shouldUseColor, unsafeGlobalDynFlags,
+                                  shouldUseHexWordLiterals )
 import {-# SOURCE #-}   Module( UnitId, Module, ModuleName, moduleName )
 import {-# SOURCE #-}   OccName( OccName )
 
@@ -555,6 +556,7 @@ ptext    :: LitString  -> SDoc
 ztext    :: FastZString -> SDoc
 int      :: Int        -> SDoc
 integer  :: Integer    -> SDoc
+word     :: Integer    -> SDoc
 float    :: Float      -> SDoc
 double   :: Double     -> SDoc
 rational :: Rational   -> SDoc
@@ -573,6 +575,11 @@ integer n   = docToSDoc $ Pretty.integer n
 float n     = docToSDoc $ Pretty.float n
 double n    = docToSDoc $ Pretty.double n
 rational n  = docToSDoc $ Pretty.rational n
+word n      = sdocWithDynFlags $ \dflags ->
+    -- See Note [Print Hexadecimal Literals] in Pretty.hs
+    if shouldUseHexWordLiterals dflags
+        then docToSDoc $ Pretty.hex n
+        else docToSDoc $ Pretty.integer n
 
 -- | @doublePrec p n@ shows a floating point number @n@ with @p@
 -- digits of precision after the decimal point.
@@ -969,9 +976,9 @@ pprPrimChar :: Char -> SDoc
 pprPrimInt, pprPrimWord, pprPrimInt64, pprPrimWord64 :: Integer -> SDoc
 pprPrimChar c   = pprHsChar c <> primCharSuffix
 pprPrimInt i    = integer i   <> primIntSuffix
-pprPrimWord w   = integer w   <> primWordSuffix
+pprPrimWord w   = word    w   <> primWordSuffix
 pprPrimInt64 i  = integer i   <> primInt64Suffix
-pprPrimWord64 w = integer w   <> primWord64Suffix
+pprPrimWord64 w = word    w   <> primWord64Suffix
 
 ---------------------
 -- Put a name in parens if it's an operator
diff --git a/compiler/utils/Pretty.hs b/compiler/utils/Pretty.hs
index f4987d3751..9a12c7dae9 100644
--- a/compiler/utils/Pretty.hs
+++ b/compiler/utils/Pretty.hs
@@ -72,7 +72,7 @@ module Pretty (
 
         -- ** Converting values into documents
         char, text, ftext, ptext, ztext, sizedText, zeroWidthText,
-        int, integer, float, double, rational,
+        int, integer, float, double, rational, hex,
 
         -- ** Simple derived documents
         semi, comma, colon, space, equals,
@@ -117,6 +117,7 @@ import BufWrite
 import FastString
 import Panic
 import System.IO
+import Numeric (showHex)
 
 --for a RULES
 import GHC.Base ( unpackCString# )
@@ -404,11 +405,18 @@ integer  :: Integer  -> Doc -- ^ @integer n = text (show n)@
 float    :: Float    -> Doc -- ^ @float n = text (show n)@
 double   :: Double   -> Doc -- ^ @double n = text (show n)@
 rational :: Rational -> Doc -- ^ @rational n = text (show n)@
+hex      :: Integer  -> Doc -- ^ See Note [Print Hexadecimal Literals]
 int      n = text (show n)
 integer  n = text (show n)
 float    n = text (show n)
 double   n = text (show n)
 rational n = text (show n)
+hex      n = text ('0' : 'x' : padded)
+    where
+    str = showHex n ""
+    strLen = max 1 (length str)
+    len = 2 ^ (ceiling (logBase 2 (fromIntegral strLen :: Double)) :: Int)
+    padded = replicate (len - strLen) '0' ++ str
 
 parens       :: Doc -> Doc -- ^ Wrap document in @(...)@
 brackets     :: Doc -> Doc -- ^ Wrap document in @[...]@
@@ -423,6 +431,57 @@ parens p       = char '(' <> p <> char ')'
 brackets p     = char '[' <> p <> char ']'
 braces p       = char '{' <> p <> char '}'
 
+{-
+Note [Print Hexadecimal Literals]
+
+Relevant discussions:
+ * Phabricator: https://phabricator.haskell.org/D4465
+ * GHC Trac: https://ghc.haskell.org/trac/ghc/ticket/14872
+
+There is a flag `-dword-hex-literals` that causes literals of
+type `Word#` or `Word64#` to be displayed in hexadecimal instead
+of decimal when dumping GHC core. It also affects the presentation
+of these in GHC's error messages. Additionally, the hexadecimal
+encoding of these numbers is zero-padded so that its length is
+a power of two. As an example of what this does,
+consider the following haskell file `Literals.hs`:
+
+    module Literals where
+
+    alpha :: Int
+    alpha = 100 + 200
+
+    beta :: Word -> Word
+    beta x = x + div maxBound 255 + div 0xFFFFFFFF 255 + 0x0202
+
+We get the following dumped core when we compile on a 64-bit
+machine with ghc -O2 -fforce-recomp -ddump-simpl -dsuppress-all
+-dhex-word-literals literals.hs:
+
+    ==================== Tidy Core ====================
+
+    ... omitted for brevity ...
+
+    -- RHS size: {terms: 2, types: 0, coercions: 0, joins: 0/0}
+    alpha
+    alpha = I# 300#
+
+    -- RHS size: {terms: 12, types: 3, coercions: 0, joins: 0/0}
+    beta
+    beta
+      = \ x_aYE ->
+          case x_aYE of { W# x#_a1v0 ->
+          W#
+            (plusWord#
+               (plusWord# (plusWord# x#_a1v0 0x0101010101010101##) 0x01010101##)
+               0x0202##)
+          }
+
+Notice that the word literals are in hexadecimals and that they have
+been padded with zeroes so that their lengths are 16, 8, and 4, respectively.
+
+-}
+
 -- | Apply 'parens' to 'Doc' if boolean is true.
 maybeParens :: Bool -> Doc -> Doc
 maybeParens False = id
diff --git a/compiler/utils/UniqFM.hs b/compiler/utils/UniqFM.hs
index 2a9b806178..a80880f4e5 100644
--- a/compiler/utils/UniqFM.hs
+++ b/compiler/utils/UniqFM.hs
@@ -78,12 +78,10 @@ import Outputable
 import Data.List (foldl')
 
 import qualified Data.IntMap as M
-import qualified Data.IntMap.Merge.Lazy as M
-import Control.Applicative (Const (..))
-import qualified Data.Monoid as Mon
 import qualified Data.IntSet as S
 import Data.Data
 import qualified Data.Semigroup as Semi
+import Data.Functor.Classes (Eq1 (..))
 
 
 newtype UniqFM ele = UFM (M.IntMap ele)
@@ -342,10 +340,7 @@ ufmToIntMap (UFM m) = m
 
 -- Determines whether two 'UniqFm's contain the same keys.
 equalKeysUFM :: UniqFM a -> UniqFM b -> Bool
-equalKeysUFM (UFM m1) (UFM m2) = Mon.getAll $ getConst $
-      M.mergeA (M.traverseMissing (\_ _ -> Const (Mon.All False)))
-               (M.traverseMissing (\_ _ -> Const (Mon.All False)))
-               (M.zipWithAMatched (\_ _ _ -> Const (Mon.All True))) m1 m2
+equalKeysUFM (UFM m1) (UFM m2) = liftEq (\_ _ -> True) m1 m2
 
 -- Instances
 
diff --git a/configure.ac b/configure.ac
index e75fc6c7dd..1bab56d7fe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -55,13 +55,14 @@ AC_SUBST([release], [1])
 # First off, a distrib sanity check..
 AC_CONFIG_SRCDIR([mk/config.mk.in])
 
-dnl * We require autoconf version 2.60
-dnl We need 2.50 due to the use of AC_SYS_LARGEFILE and AC_MSG_NOTICE.
-dnl We need 2.52 due to the use of AS_TR_CPP and AS_TR_SH.
-dnl Using autoconf 2.59 started to give nonsense like this
-dnl  #define SIZEOF_CHAR 0
-dnl recently.
-AC_PREREQ([2.60])
+dnl * We require autoconf version 2.69 due to
+dnl   https://bugs.ruby-lang.org/issues/8179. Also see #14910.
+dnl * We need 2.50 due to the use of AC_SYS_LARGEFILE and AC_MSG_NOTICE.
+dnl * We need 2.52 due to the use of AS_TR_CPP and AS_TR_SH.
+dnl * Using autoconf 2.59 started to give nonsense like this
+dnl     #define SIZEOF_CHAR 0
+dnl   recently.
+AC_PREREQ([2.69])
 
 # -------------------------------------------------------------------------
 # Prepare to generate the following header files
diff --git a/distrib/configure.ac.in b/distrib/configure.ac.in
index 95ad1986bb..ed1c29671a 100644
--- a/distrib/configure.ac.in
+++ b/distrib/configure.ac.in
@@ -6,6 +6,9 @@ dnl
 
 AC_INIT([The Glorious Glasgow Haskell Compilation System], [@ProjectVersion@], [glasgow-haskell-bugs@haskell.org], [ghc-AC_PACKAGE_VERSION])
 
+dnl See /configure.ac for rationale.
+AC_PREREQ([2.69])
+
 dnl--------------------------------------------------------------------
 dnl * Deal with arguments telling us gmp is somewhere odd
 dnl--------------------------------------------------------------------
diff --git a/docs/users_guide/debugging.rst b/docs/users_guide/debugging.rst
index d11cc04fd0..c6d90e642d 100644
--- a/docs/users_guide/debugging.rst
+++ b/docs/users_guide/debugging.rst
@@ -588,6 +588,20 @@ Formatting dumps
     let expressions. This is helpful when your code does a lot of
     unboxing.
 
+.. ghc-flag:: -dhex-word-literals
+    :shortdesc: Print values of type `Word#` in hexadecimal.
+    :type: dynamic
+
+    Print values of type `Word#` and `Word64#` (but not values of
+    type `Int#` and `Int64#`) in hexadecimal instead of decimal.
+    The hexadecimal is zero-padded to make the length of the
+    representation a power of two. For example: `0x0A0A##`,
+    `0x000FFFFF##`, `0xC##`. This flag may be helpful when you
+    are producing a bit pattern that to expect to work correctly on a 32-bit
+    or a 64-bit architecture. Dumping hexadecimal literals after
+    optimizations and constant folding makes it easier to confirm
+    that the generated bit pattern is correct.
+
 .. ghc-flag:: -dno-debug-output
     :shortdesc: Suppress unsolicited debugging output
     :type: dynamic
diff --git a/docs/users_guide/ghc_packages.py b/docs/users_guide/ghc_packages.py
index d4a688b370..6419834e1e 100644
--- a/docs/users_guide/ghc_packages.py
+++ b/docs/users_guide/ghc_packages.py
@@ -8,13 +8,13 @@ from utils import build_table_from_list
 def read_cabal_file(pkg_path):
     import re
     cabal_file = open(pkg_path, 'r').read()
-    pkg_name = re.search(r'[nN]ame:\s*([-a-zA-Z0-9]+)', cabal_file)
+    pkg_name = re.search(r'^[nN]ame\s*:\s*([-a-zA-Z0-9]+)', cabal_file, re.MULTILINE)
     if pkg_name is not None:
         pkg_name = pkg_name.group(1)
     else:
         raise RuntimeError("Failed to parse `Name:` field from %s" % pkg_path)
 
-    pkg_version = re.search(r'[vV]ersion:\s*(\d+(\.\d+)*)', cabal_file)
+    pkg_version = re.search(r'^[vV]ersion\s*:\s*(\d+(\.\d+)*)', cabal_file, re.MULTILINE)
     if pkg_version is not None:
         pkg_version = pkg_version.group(1)
     else:
diff --git a/docs/users_guide/glasgow_exts.rst b/docs/users_guide/glasgow_exts.rst
index c6cff92790..49c6ed4709 100644
--- a/docs/users_guide/glasgow_exts.rst
+++ b/docs/users_guide/glasgow_exts.rst
@@ -9442,7 +9442,7 @@ The following things have kind ``Constraint``:
 -  Anything whose form is not yet known, but the user has declared to
    have kind ``Constraint`` (for which they need to import it from
    ``GHC.Exts``). So for example
-   ``type Foo (f :: \* -> Constraint) = forall b. f b => b -> b``
+   ``type Foo (f :: * -> Constraint) = forall b. f b => b -> b``
    is allowed, as well as examples involving type families: ::
 
        type family Typ a b :: Constraint
diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst
index 009e3ae887..92bc739dfe 100644
--- a/docs/users_guide/runtime_control.rst
+++ b/docs/users_guide/runtime_control.rst
@@ -776,6 +776,7 @@ RTS options to produce runtime statistics
               -s [⟨file⟩]
               -S [⟨file⟩]
               --machine-readable
+              --internal-counters
 
     These options produce runtime-system statistics, such as the amount
     of time spent executing the program and in the garbage collector,
@@ -785,7 +786,10 @@ RTS options to produce runtime statistics
     line of output in the same format as GHC's ``-Rghc-timing`` option,
     ``-s`` produces a more detailed summary at the end of the program,
     and ``-S`` additionally produces information about each and every
-    garbage collection.
+    garbage collection. Passing ``--internal-counters`` to a threaded
+    runtime will cause a detailed summary to include various internal
+    counts accumulated during the run; note that these are unspecified
+    and may change between releases.
 
     The output is placed in ⟨file⟩. If ⟨file⟩ is omitted, then the
     output is sent to ``stderr``.
diff --git a/docs/users_guide/using-optimisation.rst b/docs/users_guide/using-optimisation.rst
index 3566462eeb..d6c24de502 100644
--- a/docs/users_guide/using-optimisation.rst
+++ b/docs/users_guide/using-optimisation.rst
@@ -884,6 +884,20 @@ by saying ``-fno-wombat``.
     which they are called in this module. Note that specialisation must be
     enabled (by ``-fspecialise``) for this to have any effect.
 
+.. ghc-flag:: -flate-specialise
+    :shortdesc: Run a late specialisation pass
+    :type: dynamic
+    :reverse: -fno-late-specialise
+    :default: off
+
+    Runs another specialisation pass towards the end of the optimisation
+    pipeline. This can catch specialisation opportunities which arose from
+    the previous specialisation pass or other inlining.
+
+    You might want to use this if you are you have a type class method
+    which returns a constrained type. For example, a type class where one
+    of the methods implements a traversal.
+
 .. ghc-flag:: -fsolve-constant-dicts
     :shortdesc: When solving constraints, try to eagerly solve
         super classes using available dictionaries.
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 57d78ccaa5..18b2aaf324 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -161,9 +161,11 @@
 
 /* TO_W_(n) converts n to W_ type from a smaller type */
 #if SIZEOF_W == 4
+#define TO_I64(x) %sx64(x)
 #define TO_W_(x) %sx32(x)
 #define HALF_W_(x) %lobits16(x)
 #elif SIZEOF_W == 8
+#define TO_I64(x) (x)
 #define TO_W_(x) %sx64(x)
 #define HALF_W_(x) %lobits32(x)
 #endif
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index 27a5080220..6f011cbf6e 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -210,6 +210,30 @@ typedef struct _RTSStats {
 
   GCDetails gc;
 
+  // -----------------------------------
+  // Internal Counters
+
+  // The number of times a GC thread spun on its 'gc_spin' lock.
+  // Will be zero if the rts was not built with PROF_SPIN
+  uint64_t gc_spin_spin;
+  // The number of times a GC thread yielded on its 'gc_spin' lock.
+  // Will be zero if the rts was not built with PROF_SPIN
+  uint64_t gc_spin_yield;
+  // The number of times a GC thread spun on its 'mut_spin' lock.
+  // Will be zero if the rts was not built with PROF_SPIN
+  uint64_t mut_spin_spin;
+  // The number of times a GC thread yielded on its 'mut_spin' lock.
+  // Will be zero if the rts was not built with PROF_SPIN
+  uint64_t mut_spin_yield;
+  // The number of times a GC thread has checked for work across all parallel
+  // GCs
+  uint64_t any_work;
+  // The number of times a GC thread has checked for work and found none across
+  // all parallel GCs
+  uint64_t no_work;
+  // The number of times a GC thread has iterated it's outer loop across all
+  // parallel GCs
+  uint64_t scav_find_work;
 } RTSStats;
 
 void getRTSStats (RTSStats *s);
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index aed4dca384..6487947749 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -195,6 +195,7 @@ typedef struct _MISC_FLAGS {
     bool generate_dump_file;
     bool generate_stack_trace;
     bool machineReadable;
+    bool internalCounters;       /* See Note [Internal Counter Stats] */
     StgWord linkerMemBase;       /* address to ask the OS for memory
                                   * for the linker, NULL ==> off */
 } MISC_FLAGS;
diff --git a/includes/rts/SpinLock.h b/includes/rts/SpinLock.h
index 6530a3a2f0..1dca02f795 100644
--- a/includes/rts/SpinLock.h
+++ b/includes/rts/SpinLock.h
@@ -27,7 +27,8 @@
 typedef struct SpinLock_
 {
     StgWord   lock;
-    StgWord64 spin; // DEBUG version counts how much it spins
+    StgWord64 spin;  // incremented every time we spin in ACQUIRE_SPIN_LOCK
+    StgWord64 yield; // incremented every time we yield in ACQUIRE_SPIN_LOCK
 } SpinLock;
 #else
 typedef StgWord SpinLock;
@@ -49,6 +50,7 @@ INLINE_HEADER void ACQUIRE_SPIN_LOCK(SpinLock * p)
             p->spin++;
             busy_wait_nop();
         }
+        p->yield++;
         yieldThread();
     } while (1);
 }
@@ -66,6 +68,7 @@ INLINE_HEADER void initSpinLock(SpinLock * p)
     write_barrier();
     p->lock = 1;
     p->spin = 0;
+    p->yield = 0;
 }
 
 #else
diff --git a/includes/rts/Threads.h b/includes/rts/Threads.h
index fceacdc75d..f72f5ed121 100644
--- a/includes/rts/Threads.h
+++ b/includes/rts/Threads.h
@@ -43,8 +43,6 @@ StgRegTable * resumeThread  (void *);
 //
 int     cmp_thread                       (StgPtr tso1, StgPtr tso2);
 int     rts_getThreadId                  (StgPtr tso);
-HsInt64 rts_getThreadAllocationCounter   (StgPtr tso);
-void    rts_setThreadAllocationCounter   (StgPtr tso, HsInt64 i);
 void    rts_enableThreadAllocationLimit  (StgPtr tso);
 void    rts_disableThreadAllocationLimit (StgPtr tso);
 
diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h
index 2aed7c57ee..d4182dd7f9 100644
--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -120,7 +120,7 @@ typedef struct generation_ {
     // stats information
     uint32_t collections;
     uint32_t par_collections;
-    uint32_t failed_promotions;
+    uint32_t failed_promotions;         // Currently unused
 
     // ------------------------------------
     // Fields below are used during GC only
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 76cfbd6c8c..1fbfab9fbe 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -468,6 +468,9 @@ RTS_FUN_DECL(stg_traceCcszh);
 RTS_FUN_DECL(stg_clearCCSzh);
 RTS_FUN_DECL(stg_traceEventzh);
 RTS_FUN_DECL(stg_traceMarkerzh);
+RTS_FUN_DECL(stg_getThreadAllocationCounterzh);
+RTS_FUN_DECL(stg_setThreadAllocationCounterzh);
+
 
 /* Other misc stuff */
 // See wiki:Commentary/Compiler/Backends/PprC#Prototypes
diff --git a/libraries/array b/libraries/array
-Subproject 9d63218fd067ff4885c0efa43b388238421a5c8
+Subproject 1d0435f4937f03901e32304e279f46ce19b0f08
diff --git a/libraries/base/GHC/Conc/Sync.hs b/libraries/base/GHC/Conc/Sync.hs
index 517c20e45f..94601f356d 100644
--- a/libraries/base/GHC/Conc/Sync.hs
+++ b/libraries/base/GHC/Conc/Sync.hs
@@ -105,6 +105,7 @@ import Data.Maybe
 import GHC.Base
 import {-# SOURCE #-} GHC.IO.Handle ( hFlush )
 import {-# SOURCE #-} GHC.IO.Handle.FD ( stdout )
+import GHC.Int
 import GHC.IO
 import GHC.IO.Encoding.UTF8
 import GHC.IO.Exception
@@ -194,18 +195,16 @@ instance Ord ThreadId where
 --
 -- @since 4.8.0.0
 setAllocationCounter :: Int64 -> IO ()
-setAllocationCounter i = do
-  ThreadId t <- myThreadId
-  rts_setThreadAllocationCounter t i
+setAllocationCounter (I64# i) = IO $ \s ->
+  case setThreadAllocationCounter# i s of s' -> (# s', () #)
 
 -- | Return the current value of the allocation counter for the
 -- current thread.
 --
 -- @since 4.8.0.0
 getAllocationCounter :: IO Int64
-getAllocationCounter = do
-  ThreadId t <- myThreadId
-  rts_getThreadAllocationCounter t
+getAllocationCounter = IO $ \s ->
+  case getThreadAllocationCounter# s of (# s', ctr #) -> (# s', I64# ctr #)
 
 -- | Enables the allocation counter to be treated as a limit for the
 -- current thread.  When the allocation limit is enabled, if the
@@ -242,16 +241,6 @@ disableAllocationLimit = do
   ThreadId t <- myThreadId
   rts_disableThreadAllocationLimit t
 
--- We cannot do these operations safely on another thread, because on
--- a 32-bit machine we cannot do atomic operations on a 64-bit value.
--- Therefore, we only expose APIs that allow getting and setting the
--- limit of the current thread.
-foreign import ccall unsafe "rts_setThreadAllocationCounter"
-  rts_setThreadAllocationCounter :: ThreadId# -> Int64 -> IO ()
-
-foreign import ccall unsafe "rts_getThreadAllocationCounter"
-  rts_getThreadAllocationCounter :: ThreadId# -> IO Int64
-
 foreign import ccall unsafe "rts_enableThreadAllocationLimit"
   rts_enableThreadAllocationLimit :: ThreadId# -> IO ()
 
diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc
index 3e712ca900..046975577e 100644
--- a/libraries/base/GHC/RTS/Flags.hsc
+++ b/libraries/base/GHC/RTS/Flags.hsc
@@ -138,6 +138,7 @@ data MiscFlags = MiscFlags
     , generateCrashDumpFile :: Bool
     , generateStackTrace    :: Bool
     , machineReadable       :: Bool
+    , internalCounters      :: Bool
     , linkerMemBase         :: Word
       -- ^ address to ask the OS for memory for the linker, 0 ==> off
     } deriving ( Show -- ^ @since 4.8.0.0
@@ -441,6 +442,8 @@ getMiscFlags = do
                   (#{peek MISC_FLAGS, generate_stack_trace} ptr :: IO CBool))
             <*> (toBool <$>
                   (#{peek MISC_FLAGS, machineReadable} ptr :: IO CBool))
+            <*> (toBool <$>
+                  (#{peek MISC_FLAGS, internalCounters} ptr :: IO CBool))
             <*> #{peek MISC_FLAGS, linkerMemBase} ptr
 
 getDebugFlags :: IO DebugFlags
diff --git a/libraries/integer-gmp/configure.ac b/libraries/integer-gmp/configure.ac
index c19dbbc4a8..3aebeba5d5 100644
--- a/libraries/integer-gmp/configure.ac
+++ b/libraries/integer-gmp/configure.ac
@@ -1,4 +1,4 @@
-AC_PREREQ(2.60)
+AC_PREREQ(2.69)
 AC_INIT([Haskell integer (GMP)], [1.0], [libraries@haskell.org], [integer])
 
 # Safety check: Ensure that we are in the correct source directory.
diff --git a/rts/Messages.c b/rts/Messages.c
index 8fab314bc4..a9c794d823 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -129,6 +129,9 @@ loop:
     }
     else if (i == &stg_WHITEHOLE_info)
     {
+#if defined(PROF_SPIN)
+        ++whitehole_executeMessage_spin;
+#endif
         goto loop;
     }
     else
diff --git a/rts/Messages.h b/rts/Messages.h
index e60f19dc1d..18371564c4 100644
--- a/rts/Messages.h
+++ b/rts/Messages.h
@@ -31,3 +31,7 @@ doneWithMsgThrowTo (MessageThrowTo *m)
 }
 
 #include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_executeMessage_spin;
+#endif
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index 67a238488c..e3f6e4cd19 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -2491,3 +2491,23 @@ stg_traceMarkerzh ( W_ msg )
     return ();
 }
 
+
+stg_getThreadAllocationCounterzh ()
+{
+    // Account for the allocation in the current block
+    W_ offset;
+    offset = Hp - bdescr_start(CurrentNursery);
+    return (StgTSO_alloc_limit(CurrentTSO) - TO_I64(offset));
+}
+
+stg_setThreadAllocationCounterzh ( I64 counter )
+{
+    // Allocation in the current block will be subtracted by
+    // getThreadAllocationCounter#, so we have to offset any existing
+    // allocation here.  See also openNursery/closeNursery in
+    // compiler/codeGen/StgCmmForeign.hs.
+    W_ offset;
+    offset = Hp - bdescr_start(CurrentNursery);
+    StgTSO_alloc_limit(CurrentTSO) = counter + TO_I64(offset);
+    return ();
+}
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 7b38bbd6fd..b674e9b685 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -232,6 +232,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.MiscFlags.generate_stack_trace    = true;
     RtsFlags.MiscFlags.generate_dump_file      = false;
     RtsFlags.MiscFlags.machineReadable         = false;
+    RtsFlags.MiscFlags.internalCounters        = false;
     RtsFlags.MiscFlags.linkerMemBase           = 0;
 
 #if defined(THREADED_RTS)
@@ -888,6 +889,11 @@ error = true;
                       OPTION_UNSAFE;
                       RtsFlags.MiscFlags.machineReadable = true;
                   }
+                  else if (strequal("internal-counters",
+                                    &rts_argv[arg][2])) {
+                      OPTION_SAFE;
+                      RtsFlags.MiscFlags.internalCounters = true;
+                  }
                   else if (strequal("info",
                                &rts_argv[arg][2])) {
                       OPTION_SAFE;
diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c
index e53a056a4c..d5800fd336 100644
--- a/rts/RtsSymbols.c
+++ b/rts/RtsSymbols.c
@@ -743,8 +743,6 @@
       SymI_HasProto(rts_isProfiled)                                     \
       SymI_HasProto(rts_isDynamic)                                      \
       SymI_HasProto(rts_setInCallCapability)                            \
-      SymI_HasProto(rts_getThreadAllocationCounter)                     \
-      SymI_HasProto(rts_setThreadAllocationCounter)                     \
       SymI_HasProto(rts_enableThreadAllocationLimit)                    \
       SymI_HasProto(rts_disableThreadAllocationLimit)                   \
       SymI_HasProto(rts_setMainThread)                                  \
@@ -895,6 +893,8 @@
       SymI_HasProto(stg_traceCcszh)                                     \
       SymI_HasProto(stg_traceEventzh)                                   \
       SymI_HasProto(stg_traceMarkerzh)                                  \
+      SymI_HasProto(stg_getThreadAllocationCounterzh)                   \
+      SymI_HasProto(stg_setThreadAllocationCounterzh)                   \
       SymI_HasProto(getMonotonicNSec)                                   \
       SymI_HasProto(lockFile)                                           \
       SymI_HasProto(unlockFile)                                         \
diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h
index 4ea1c55976..1d18e1b018 100644
--- a/rts/SMPClosureOps.h
+++ b/rts/SMPClosureOps.h
@@ -38,6 +38,11 @@ EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info);
 
 #if defined(THREADED_RTS)
 
+#if defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_lockClosure_spin;
+extern volatile StgWord64 whitehole_lockClosure_yield;
+#endif
+
 /* -----------------------------------------------------------------------------
  * Locking/unlocking closures
  *
@@ -56,7 +61,14 @@ EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p)
         do {
             info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
             if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info;
+#if defined(PROF_SPIN)
+            ++whitehole_lockClosure_spin;
+#endif
+            busy_wait_nop();
         } while (++i < SPIN_COUNT);
+#if defined(PROF_SPIN)
+        ++whitehole_lockClosure_yield;
+#endif
         yieldThread();
     } while (1);
 }
diff --git a/rts/Stats.c b/rts/Stats.c
index 26bdac0ea5..7eb93bef33 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -16,10 +16,15 @@
 #include "Profiling.h"
 #include "GetTime.h"
 #include "sm/Storage.h"
-#include "sm/GC.h" // gc_alloc_block_sync, whitehole_gc_spin
 #include "sm/GCThread.h"
 #include "sm/BlockAlloc.h"
 
+// for spin/yield counters
+#include "sm/GC.h"
+#include "ThreadPaused.h"
+#include "Messages.h"
+
+
 #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
 
 static Time
@@ -43,6 +48,13 @@ static Time HCe_start_time, HCe_tot_time = 0;   // heap census prof elap time
 #define PROF_VAL(x)   0
 #endif
 
+#if defined(PROF_SPIN)
+volatile StgWord64 whitehole_lockClosure_spin = 0;
+volatile StgWord64 whitehole_lockClosure_yield = 0;
+volatile StgWord64 whitehole_threadPaused_spin = 0;
+volatile StgWord64 whitehole_executeMessage_spin = 0;
+#endif
+
 //
 // All the stats!
 //
@@ -150,6 +162,13 @@ initStats0(void)
         .par_copied_bytes = 0,
         .cumulative_par_max_copied_bytes = 0,
         .cumulative_par_balanced_copied_bytes = 0,
+        .gc_spin_spin = 0,
+        .gc_spin_yield = 0,
+        .mut_spin_spin = 0,
+        .mut_spin_yield = 0,
+        .any_work = 0,
+        .no_work = 0,
+        .scav_find_work = 0,
         .mutator_cpu_ns = 0,
         .mutator_elapsed_ns = 0,
         .gc_cpu_ns = 0,
@@ -283,10 +302,11 @@ stat_startGC (Capability *cap, gc_thread *gct)
    -------------------------------------------------------------------------- */
 
 void
-stat_endGC (Capability *cap, gc_thread *gct,
-            W_ live, W_ copied, W_ slop, uint32_t gen,
-            uint32_t par_n_threads, W_ par_max_copied,
-            W_ par_balanced_copied)
+stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
+            uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
+            W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
+            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
+            W_ scav_find_work)
 {
     // -------------------------------------------------
     // Collect all the stats about this GC in stats.gc. We always do this since
@@ -350,6 +370,13 @@ stat_endGC (Capability *cap, gc_thread *gct,
             stats.gc.par_max_copied_bytes;
         stats.cumulative_par_balanced_copied_bytes +=
             stats.gc.par_balanced_copied_bytes;
+        stats.any_work += any_work;
+        stats.no_work += no_work;
+        stats.scav_find_work += scav_find_work;
+        stats.gc_spin_spin += gc_spin_spin;
+        stats.gc_spin_yield += gc_spin_yield;
+        stats.mut_spin_spin += mut_spin_spin;
+        stats.mut_spin_yield += mut_spin_yield;
     }
     stats.gc_cpu_ns += stats.gc.cpu_ns;
     stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -764,18 +791,96 @@ stat_exit (void)
                                 PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
                     / TimeToSecondsDbl(tot_elapsed));
 
+            // See Note [Internal Counter Stats] for a description of the
+            // following counters. If you add a counter here, please remember
+            // to update the Note.
+            if (RtsFlags.MiscFlags.internalCounters) {
 #if defined(THREADED_RTS) && defined(PROF_SPIN)
-            {
                 uint32_t g;
+                const int32_t col_width[] = {4, -30, 14, 14};
+                statsPrintf("Internal Counters:\n");
+                statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+                            , col_width[0], ""
+                            , col_width[1], "SpinLock"
+                            , col_width[2], "Spins"
+                            , col_width[3], "Yields");
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "gc_alloc_block_sync"
+                            , col_width[2], gc_alloc_block_sync.spin
+                            , col_width[3], gc_alloc_block_sync.yield);
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "gc_spin"
+                            , col_width[2], stats.gc_spin_spin
+                            , col_width[3], stats.gc_spin_yield);
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "mut_spin"
+                            , col_width[2], stats.mut_spin_spin
+                            , col_width[3], stats.mut_spin_yield);
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                            , col_width[0], ""
+                            , col_width[1], "whitehole_gc"
+                            , col_width[2], whitehole_gc_spin
+                            , col_width[3], "n/a");
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                            , col_width[0], ""
+                            , col_width[1], "whitehole_threadPaused"
+                            , col_width[2], whitehole_threadPaused_spin
+                            , col_width[3], "n/a");
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                            , col_width[0], ""
+                            , col_width[1], "whitehole_executeMessage"
+                            , col_width[2], whitehole_executeMessage_spin
+                            , col_width[3], "n/a");
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "whitehole_lockClosure"
+                            , col_width[2], whitehole_lockClosure_spin
+                            , col_width[3], whitehole_lockClosure_yield);
+                // waitForGcThreads isn't really spin-locking(see the function)
+                // but these numbers still seem useful.
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "waitForGcThreads"
+                            , col_width[2], waitForGcThreads_spin
+                            , col_width[3], waitForGcThreads_yield);
 
-                statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
-                statsPrintf("whitehole_gc_spin: %"FMT_Word64"\n"
-                            , whitehole_gc_spin);
                 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-                    statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
+                    int prefix_length = 0;
+                    statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+                                col_width[0], "", g, &prefix_length);
+                    prefix_length -= col_width[0];
+                    int suffix_length = col_width[1] + prefix_length;
+                    suffix_length =
+                      suffix_length > 0 ? col_width[1] : suffix_length;
+
+                    statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                                , suffix_length, "].sync"
+                                , col_width[2], generations[g].sync.spin
+                                , col_width[3], generations[g].sync.yield);
                 }
-            }
+                statsPrintf("\n");
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "any_work"
+                            , col_width[2], stats.any_work);
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "no_work"
+                            , col_width[2], stats.no_work);
+                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                            , col_width[0], ""
+                            , col_width[1], "scav_find_work"
+                            , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+                statsPrintf("Internal Counters require the RTS to be built "
+                        "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+                statsPrintf("Internal Counters require the threaded RTS");
 #endif
+            }
         }
 
         if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
@@ -917,6 +1022,68 @@ the number of gc threads is limited to the number of cores.
 See #13830
 */
 
+/*
+Note [Internal Counter Stats]
+-----------------------------
+What do the counts at the end of a '+RTS -s --internal-counters' report mean?
+They are detailed below. Most of these counters are used by multiple threads
+with no attempt at synchronisation. This means that reported values  may be
+lower than the true value and this becomes more likely and more severe as
+contention increases.
+
+The first counters are for various SpinLock-like constructs in the RTS. See
+Spinlock.h for the definition of a SpinLock. We maintain up two counters per
+SpinLock:
+* spin: The number of busy-spins over the length of the program.
+* yield: The number of times the SpinLock spun SPIN_COUNT times without success
+         and called yieldThread().
+Not all of these are actual SpinLocks, see the details below.
+
+Actual SpinLocks:
+* gc_alloc_block:
+    This SpinLock protects the block allocator and free list manager. See
+    BlockAlloc.c.
+* gc_spin and mut_spin:
+    These SpinLocks are used to herd gc worker threads during parallel garbage
+    collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
+* gen[g].sync:
+    These SpinLocks, one per generation, protect the generations[g] data
+    structure during garbage collection.
+
+waitForGcThreads:
+  These counters are incremented while we wait for all threads to be ready
+  for a parallel garbage collection. We yield more than we spin in this case.
+
+In several places in the runtime we must take a lock on a closure. To do this,
+we replace it's info table with stg_WHITEHOLE_info, spinning if it is already
+a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
+don't. We count these white-hole spins and include them in the SpinLocks table.
+If a particular loop does not yield, we put "n/a" in the table. They are named
+for the function that has the spinning loop except that several loops in the
+garbage collector accumulate into whitehole_gc.
+TODO: Should these counters be more or less granular?
+
+white-hole spin counters:
+* whitehole_gc
+* whitehole_lockClosure
+* whitehole_executeMessage
+* whitehole_threadPaused
+
+
+We count the number of calls of several functions in the parallel garbage
+collector.
+
+Parallel garbage collector counters:
+* any_work:
+    A cheap function called whenever a gc_thread is ready for work. Does
+    not do any work.
+* no_work:
+    Incremented whenever any_work finds no work.
+* scav_find_work:
+    Called to do work when any_work return true.
+
+*/
+
 /* -----------------------------------------------------------------------------
    stat_describe_gens
 
diff --git a/rts/Stats.h b/rts/Stats.h
index 5d9cf04fa7..1c56344f81 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -30,7 +30,10 @@ void      stat_startGCSync(struct gc_thread_ *_gct);
 void      stat_startGC(Capability *cap, struct gc_thread_ *_gct);
 void      stat_endGC  (Capability *cap, struct gc_thread_ *_gct, W_ live,
                        W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
-                       W_ par_max_copied, W_ par_balanced_copied);
+                       W_ par_max_copied, W_ par_balanced_copied,
+                       W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
+                       W_ mut_spin_yield, W_ any_work, W_ no_work,
+                       W_ scav_find_work);
 
 #if defined(PROFILING)
 void      stat_startRP(void);
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index 361989d0d2..595d3ce6c2 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -375,11 +375,21 @@ loop:
     // spin until the WHITEHOLE is updated
     info = StgHeader_info(node);
     if (info == stg_WHITEHOLE_info) {
+#if defined(PROF_SPIN)
+        W_[whitehole_lockClosure_spin] =
+            W_[whitehole_lockClosure_spin] + 1;
+#endif
         i = i + 1;
         if (i == SPIN_COUNT) {
             i = 0;
+#if defined(PROF_SPIN)
+            W_[whitehole_lockClosure_yield] =
+                W_[whitehole_lockClosure_yield] + 1;
+#endif
             ccall yieldThread();
         }
+        // TODO: We should busy_wait_nop() here, but that's not currently
+        // defined in CMM.
         goto loop;
     }
     jump %ENTRY_CODE(info) (node);
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index 7ade0a64c5..3f7bddeb79 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -329,6 +329,10 @@ threadPaused(Capability *cap, StgTSO *tso)
 
             if (cur_bh_info != bh_info) {
                 bh_info = cur_bh_info;
+#if defined(PROF_SPIN)
+                ++whitehole_threadPaused_spin;
+#endif
+                busy_wait_nop();
                 goto retry;
             }
 #endif
diff --git a/rts/ThreadPaused.h b/rts/ThreadPaused.h
index 4d762f9aed..ee25189c20 100644
--- a/rts/ThreadPaused.h
+++ b/rts/ThreadPaused.h
@@ -8,4 +8,12 @@
 
 #pragma once
 
+#include "BeginPrivate.h"
+
 RTS_PRIVATE void threadPaused ( Capability *cap, StgTSO * );
+
+#include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_threadPaused_spin;
+#endif
diff --git a/rts/Threads.c b/rts/Threads.c
index b76917773a..be6962246d 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -165,19 +165,8 @@ rts_getThreadId(StgPtr tso)
 }
 
 /* ---------------------------------------------------------------------------
- * Getting & setting the thread allocation limit
+ * Enabling and disabling the thread allocation limit
  * ------------------------------------------------------------------------ */
-HsInt64 rts_getThreadAllocationCounter(StgPtr tso)
-{
-    // NB. doesn't take into account allocation in the current nursery
-    // block, so it might be off by up to 4k.
-    return PK_Int64((W_*)&(((StgTSO *)tso)->alloc_limit));
-}
-
-void rts_setThreadAllocationCounter(StgPtr tso, HsInt64 i)
-{
-    ASSIGN_Int64((W_*)&(((StgTSO *)tso)->alloc_limit), i);
-}
 
 void rts_enableThreadAllocationLimit(StgPtr tso)
 {
diff --git a/rts/posix/itimer/Pthread.c b/rts/posix/itimer/Pthread.c
index e15ac2521e..c45d57e6dc 100644
--- a/rts/posix/itimer/Pthread.c
+++ b/rts/posix/itimer/Pthread.c
@@ -10,7 +10,7 @@
  * We use a realtime timer by default.  I found this much more
  * reliable than a CPU timer:
  *
- * Experiments with different frequences: using
+ * Experiments with different frequencies: using
  * CLOCK_REALTIME/CLOCK_MONOTONIC on Linux 2.6.32,
  *     1000us has  <1% impact on runtime
  *      100us has  ~2% impact on runtime
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index bb54c7efc3..27f280665e 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -1058,9 +1058,14 @@ selector_chain:
     // In threaded mode, we'll use WHITEHOLE to lock the selector
     // thunk while we evaluate it.
     {
-        do {
+        while(true) {
             info_ptr = xchg((StgPtr)&p->header.info, (W_)&stg_WHITEHOLE_info);
-        } while (info_ptr == (W_)&stg_WHITEHOLE_info);
+            if (info_ptr != (W_)&stg_WHITEHOLE_info) { break; }
+#if defined(PROF_SPIN)
+            ++whitehole_gc_spin;
+#endif
+            busy_wait_nop();
+        }
 
         // make sure someone else didn't get here first...
         if (IS_FORWARDING_PTR(info_ptr) ||
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 54797ba0f0..d61ca41a6b 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -139,6 +139,9 @@ uint32_t n_gc_threads;
 static long copied;        // *words* copied & scavenged during this GC
 
 #if defined(PROF_SPIN) && defined(THREADED_RTS)
+// spin and yield counts for the quasi-SpinLock in waitForGcThreads
+volatile StgWord64 waitForGcThreads_spin = 0;
+volatile StgWord64 waitForGcThreads_yield = 0;
 volatile StgWord64 whitehole_gc_spin = 0;
 #endif
 
@@ -198,7 +201,9 @@ GarbageCollect (uint32_t collect_gen,
 {
   bdescr *bd;
   generation *gen;
-  StgWord live_blocks, live_words, par_max_copied, par_balanced_copied;
+  StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
+      gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+      any_work, no_work, scav_find_work;
 #if defined(THREADED_RTS)
   gc_thread *saved_gct;
 #endif
@@ -471,32 +476,53 @@ GarbageCollect (uint32_t collect_gen,
   copied = 0;
   par_max_copied = 0;
   par_balanced_copied = 0;
+  gc_spin_spin = 0;
+  gc_spin_yield = 0;
+  mut_spin_spin = 0;
+  mut_spin_yield = 0;
+  any_work = 0;
+  no_work = 0;
+  scav_find_work = 0;
   {
       uint32_t i;
       uint64_t par_balanced_copied_acc = 0;
+      const gc_thread* thread;
 
       for (i=0; i < n_gc_threads; i++) {
           copied += gc_threads[i]->copied;
       }
       for (i=0; i < n_gc_threads; i++) {
+          thread = gc_threads[i];
           if (n_gc_threads > 1) {
               debugTrace(DEBUG_gc,"thread %d:", i);
-              debugTrace(DEBUG_gc,"   copied           %ld", gc_threads[i]->copied * sizeof(W_));
-              debugTrace(DEBUG_gc,"   scanned          %ld", gc_threads[i]->scanned * sizeof(W_));
-              debugTrace(DEBUG_gc,"   any_work         %ld", gc_threads[i]->any_work);
-              debugTrace(DEBUG_gc,"   no_work          %ld", gc_threads[i]->no_work);
-              debugTrace(DEBUG_gc,"   scav_find_work %ld",   gc_threads[i]->scav_find_work);
+              debugTrace(DEBUG_gc,"   copied           %ld",
+                         thread->copied * sizeof(W_));
+              debugTrace(DEBUG_gc,"   scanned          %ld",
+                         thread->scanned * sizeof(W_));
+              debugTrace(DEBUG_gc,"   any_work         %ld",
+                         thread->any_work);
+              debugTrace(DEBUG_gc,"   no_work          %ld",
+                         thread->no_work);
+              debugTrace(DEBUG_gc,"   scav_find_work %ld",
+                         thread->scav_find_work);
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+              gc_spin_spin += thread->gc_spin.spin;
+              gc_spin_yield += thread->gc_spin.yield;
+              mut_spin_spin += thread->mut_spin.spin;
+              mut_spin_yield += thread->mut_spin.yield;
+#endif
+
+              any_work += thread->any_work;
+              no_work += thread->no_work;
+              scav_find_work += thread->scav_find_work;
+
+              par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
+              par_balanced_copied_acc +=
+                  stg_min(n_gc_threads * gc_threads[i]->copied, copied);
           }
-          par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
-          par_balanced_copied_acc +=
-            stg_min(n_gc_threads * gc_threads[i]->copied, copied);
-      }
-      if (n_gc_threads == 1) {
-          par_max_copied = 0;
-          par_balanced_copied = 0;
       }
-      else
-      {
+      if (n_gc_threads > 1) {
           // See Note [Work Balance] for an explanation of this computation
           par_balanced_copied =
               (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
@@ -834,7 +860,9 @@ GarbageCollect (uint32_t collect_gen,
   // ok, GC over: tell the stats department what happened.
   stat_endGC(cap, gct, live_words, copied,
              live_blocks * BLOCK_SIZE_W - live_words /* slop */,
-             N, n_gc_threads, par_max_copied, par_balanced_copied);
+             N, n_gc_threads, par_max_copied, par_balanced_copied,
+             gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+             any_work, no_work, scav_find_work);
 
 #if defined(RTS_USER_SIGNALS)
   if (RtsFlags.MiscFlags.install_signal_handlers) {
@@ -1186,6 +1214,9 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
                 }
             }
             if (!retry) break;
+#if defined(PROF_SPIN)
+            waitForGcThreads_yield++;
+#endif
             yieldThread();
         }
 
@@ -1196,6 +1227,14 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
             rtsConfig.longGCSync(cap->no, t2 - t0);
             t1 = t2;
         }
+        if (retry) {
+#if defined(PROF_SPIN)
+            // This is a bit strange, we'll get more yields than spins.
+            // I guess that means it's not a spin-lock at all, but these
+            // numbers are still useful (I think).
+            waitForGcThreads_spin++;
+#endif
+        }
     }
 
     if (RtsFlags.GcFlags.longGCSync != 0 &&
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 78f054931a..7fce87edd4 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -47,6 +47,8 @@ extern uint32_t mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS,
 
 #if defined(PROF_SPIN) && defined(THREADED_RTS)
 extern volatile StgWord64 whitehole_gc_spin;
+extern volatile StgWord64 waitForGcThreads_spin;
+extern volatile StgWord64 waitForGcThreads_yield;
 #endif
 
 void gcWorkerThread (Capability *cap);
diff --git a/testsuite/tests/cmm/Makefile b/testsuite/tests/cmm/Makefile
new file mode 100644
index 0000000000..9a36a1c5fe
--- /dev/null
+++ b/testsuite/tests/cmm/Makefile
@@ -0,0 +1,3 @@
+TOP=../..
+include $(TOP)/mk/boilerplate.mk
+include $(TOP)/mk/test.mk
diff --git a/testsuite/tests/cmm/should_run/HooplPostorder.hs b/testsuite/tests/cmm/should_run/HooplPostorder.hs
new file mode 100644
index 0000000000..d7a8bbaef1
--- /dev/null
+++ b/testsuite/tests/cmm/should_run/HooplPostorder.hs
@@ -0,0 +1,69 @@
+module Main where
+
+import Hoopl.Block
+import Hoopl.Collections
+import Hoopl.Graph
+import Hoopl.Label
+
+import Data.Maybe
+
+data TestBlock e x = TB { label_ :: Label, successors_ :: [Label] }
+    deriving (Eq, Show)
+
+instance NonLocal TestBlock where
+  entryLabel = label_
+  successors = successors_
+
+-- Test the classical diamond shape graph.
+test_diamond :: LabelMap (TestBlock C C)
+test_diamond = mapFromList $ map (\b -> (label_ b, b)) blocks
+  where
+    blocks =
+        [ TB (mkHooplLabel 1) [mkHooplLabel 2, mkHooplLabel 3]
+        , TB (mkHooplLabel 2) [mkHooplLabel 4]
+        , TB (mkHooplLabel 3) [mkHooplLabel 4]
+        , TB (mkHooplLabel 4) []
+        ]
+
+-- Test that the backedge doesn't change anything.
+test_diamond_backedge :: LabelMap (TestBlock C C)
+test_diamond_backedge = mapFromList $ map (\b -> (label_ b, b)) blocks
+  where
+    blocks =
+        [ TB (mkHooplLabel 1) [mkHooplLabel 2, mkHooplLabel 3]
+        , TB (mkHooplLabel 2) [mkHooplLabel 4]
+        , TB (mkHooplLabel 3) [mkHooplLabel 4]
+        , TB (mkHooplLabel 4) [mkHooplLabel 1]
+        ]
+
+-- Test that the "bypass" edge from 1 to 4 doesn't change anything.
+test_3 :: LabelMap (TestBlock C C)
+test_3 = mapFromList $ map (\b -> (label_ b, b)) blocks
+  where
+    blocks =
+        [ TB (mkHooplLabel 1) [mkHooplLabel 2, mkHooplLabel 4]
+        , TB (mkHooplLabel 2) [mkHooplLabel 4]
+        , TB (mkHooplLabel 4) []
+        ]
+
+-- Like test_3 but with different order of successors for the entry point.
+test_4 :: LabelMap (TestBlock C C)
+test_4 = mapFromList $ map (\b -> (label_ b, b)) blocks
+  where
+    blocks =
+        [ TB (mkHooplLabel 1) [mkHooplLabel 4, mkHooplLabel 2]
+        , TB (mkHooplLabel 2) [mkHooplLabel 4]
+        , TB (mkHooplLabel 4) []
+        ]
+
+
+main :: IO ()
+main = do
+    let result = revPostorderFrom test_diamond (mkHooplLabel 1)
+    putStrLn (show $ map label_ result)
+    let result = revPostorderFrom test_diamond_backedge (mkHooplLabel 1)
+    putStrLn (show $ map label_ result)
+    let result = revPostorderFrom test_3 (mkHooplLabel 1)
+    putStrLn (show $ map label_ result)
+    let result = revPostorderFrom test_4 (mkHooplLabel 1)
+    putStrLn (show $ map label_ result)
diff --git a/testsuite/tests/cmm/should_run/HooplPostorder.stdout b/testsuite/tests/cmm/should_run/HooplPostorder.stdout
new file mode 100644
index 0000000000..7e704c224b
--- /dev/null
+++ b/testsuite/tests/cmm/should_run/HooplPostorder.stdout
@@ -0,0 +1,4 @@
+[L1,L3,L2,L4]
+[L1,L3,L2,L4]
+[L1,L2,L4]
+[L1,L2,L4]
diff --git a/testsuite/tests/cmm/should_run/Makefile b/testsuite/tests/cmm/should_run/Makefile
new file mode 100644
index 0000000000..9101fbd40a
--- /dev/null
+++ b/testsuite/tests/cmm/should_run/Makefile
@@ -0,0 +1,3 @@
+TOP=../../..
+include $(TOP)/mk/boilerplate.mk
+include $(TOP)/mk/test.mk
diff --git a/testsuite/tests/cmm/should_run/all.T b/testsuite/tests/cmm/should_run/all.T
new file mode 100644
index 0000000000..00838075cf
--- /dev/null
+++ b/testsuite/tests/cmm/should_run/all.T
@@ -0,0 +1,4 @@
+test('HooplPostorder',
+     extra_run_opts('"' + config.libdir + '"'),
+     compile_and_run,
+     ['-package ghc'])
diff --git a/testsuite/tests/codeGen/should_run/T5129.hs b/testsuite/tests/codeGen/should_run/T5129.hs
index 6bc1912754..2808f54eae 100644
--- a/testsuite/tests/codeGen/should_run/T5129.hs
+++ b/testsuite/tests/codeGen/should_run/T5129.hs
@@ -10,12 +10,13 @@ throwIfNegative n | n < 0     = error "negative"
 data HUnitFailure = HUnitFailure String deriving (Show,Typeable)
 instance Exception HUnitFailure
 
+assertFailure :: String -> a -- Not an IO function!
 assertFailure msg = E.throw (HUnitFailure msg)
 
-case_negative =
-    handleJust errorCalls (const $ return ()) $ do
-        evaluate $ throwIfNegative (-1)
-        assertFailure "must throw when given a negative number"
+main :: IO ()
+main =
+    handleJust errorCalls (const (return ())) (do
+        evaluate (throwIfNegative (-1)) -- Pure expression evaluated in IO
+        assertFailure "must throw when given a negative number")
   where errorCalls (ErrorCall _) = Just ()
 
-main = case_negative
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index 9403c4b1e1..55386e402b 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -90,7 +90,13 @@ test('T3677', extra_run_opts('+RTS -K8k -RTS'), compile_and_run, [''])
 test('T4441', normal, compile_and_run, [''])
 test('T5149', omit_ways(['ghci']), multi_compile_and_run,
                  ['T5149', [('T5149_cmm.cmm', '')], ''])
-test('T5129', normal, compile_and_run, [''])
+
+test('T5129',
+     # The bug is in simplifier when run with -O1 and above, so only run it
+     # optimised, using any backend.
+     only_ways(['optasm']),
+     compile_and_run, [''])
+
 test('T5626', exit_code(1), compile_and_run, [''])
 test('T5747', when(arch('i386'), extra_hc_opts('-msse2')), compile_and_run, ['-O2'])
 test('T5785', normal, compile_and_run, [''])
diff --git a/testsuite/tests/rename/should_compile/T14881.hs b/testsuite/tests/rename/should_compile/T14881.hs
new file mode 100644
index 0000000000..c1b955c0f2
--- /dev/null
+++ b/testsuite/tests/rename/should_compile/T14881.hs
@@ -0,0 +1,5 @@
+module T14881 where
+
+import qualified T14881Aux as Aux (L(Cons), x, tail, adjust, length)
+
+x = Aux.Cons
diff --git a/testsuite/tests/rename/should_compile/T14881.stderr b/testsuite/tests/rename/should_compile/T14881.stderr
new file mode 100644
index 0000000000..bfb6ca913f
--- /dev/null
+++ b/testsuite/tests/rename/should_compile/T14881.stderr
@@ -0,0 +1,6 @@
+[1 of 2] Compiling T14881Aux        ( T14881Aux.hs, T14881Aux.o )
+[2 of 2] Compiling T14881           ( T14881.hs, T14881.o )
+
+T14881.hs:3:1: warning: [-Wunused-imports (in -Wextra)]
+    The qualified import of ‘adjust, length, L(tail), L(x)’
+    from module ‘T14881Aux’ is redundant
diff --git a/testsuite/tests/rename/should_compile/T14881Aux.hs b/testsuite/tests/rename/should_compile/T14881Aux.hs
new file mode 100644
index 0000000000..13b8f31d04
--- /dev/null
+++ b/testsuite/tests/rename/should_compile/T14881Aux.hs
@@ -0,0 +1,13 @@
+module T14881Aux where
+
+-- unambiguous function name.
+adjust :: ()
+adjust = undefined
+
+-- ambiguous function name.
+length :: ()
+length = undefined
+
+data L = Cons { x :: Int      -- unambiguous field selector
+              , tail :: [Int] -- ambiguous field selector
+              }
diff --git a/testsuite/tests/rename/should_compile/all.T b/testsuite/tests/rename/should_compile/all.T
index 4eb584febe..80bcb092e0 100644
--- a/testsuite/tests/rename/should_compile/all.T
+++ b/testsuite/tests/rename/should_compile/all.T
@@ -153,3 +153,4 @@ test('T12548', normal, compile, [''])
 test('T13132', normal, compile, [''])
 test('T13646', normal, compile, [''])
 test('LookupSub', [], multimod_compile, ['LookupSub', '-v0'])
+test('T14881', [], multimod_compile, ['T14881', '-W'])
diff --git a/testsuite/tests/rts/InternalCounters.stdout b/testsuite/tests/rts/InternalCounters.stdout
new file mode 100644
index 0000000000..d764d7bc19
--- /dev/null
+++ b/testsuite/tests/rts/InternalCounters.stdout
@@ -0,0 +1 @@
+Internal Counters:
+\ No newline at end of file
diff --git a/testsuite/tests/rts/Makefile b/testsuite/tests/rts/Makefile
index ded3be1b3b..630508542d 100644
--- a/testsuite/tests/rts/Makefile
+++ b/testsuite/tests/rts/Makefile
@@ -48,13 +48,13 @@ T5423:
 .PHONY: T9405
 T9405:
 	@'$(TEST_HC)' $(TEST_HC_OPTS) -ticky -rtsopts T9405.hs; \
-    ./T9405 +RTS -rT9405.ticky & \
-    sleep 0.2; \
-    kill -2 $$!; \
-    wait $$!; \
-    [ -e T9405.ticky ] || echo "Error: Ticky profile doesn't exist"; \
-    [ -s T9405.ticky ] || echo "Error: Ticky profile is empty"; \
-    echo Ticky-Ticky;
+		./T9405 +RTS -rT9405.ticky & \
+		sleep 0.2; \
+		kill -2 $$!; \
+		wait $$!; \
+		[ -e T9405.ticky ] || echo "Error: Ticky profile doesn't exist"; \
+		[ -s T9405.ticky ] || echo "Error: Ticky profile is empty"; \
+		echo Ticky-Ticky;
 
 # Naming convention: 'T5423_' obj-way '_' obj-src
 # obj-way ::= v | dyn
@@ -178,3 +178,8 @@ T12497:
 .PHONY: T14695
 T14695:
 	echo ":quit" | LD_LIBRARY_PATH="foo:" "$(TEST_HC)" $(filter-out -rtsopts, $(TEST_HC_OPTS_INTERACTIVE))
+
+.PHONY: InternalCounters
+InternalCounters:
+	"$(TEST_HC)" +RTS -s --internal-counters -RTS 2>&1 | grep "Internal Counters"
+	-"$(TEST_HC)" +RTS -s -RTS 2>&1 | grep "Internal Counters"
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index 5000a914db..ffbd05c745 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -390,3 +390,11 @@ test('T14702', [ ignore_stdout
                , compile_and_run, [''])
 
 test('T14900', normal, compile_and_run, ['-package ghc-compact'])
+test('InternalCounters', normal, run_command,
+  ['$MAKE -s --no-print-directory InternalCounters'])
+test('alloccounter1', normal, compile_and_run,
+  [
+    # avoid allocating stack chunks, which counts as
+    # allocation and messes up the results:
+    '-with-rtsopts=-k1m'
+  ])
diff --git a/testsuite/tests/rts/alloccounter1.hs b/testsuite/tests/rts/alloccounter1.hs
new file mode 100644
index 0000000000..4b81896d2c
--- /dev/null
+++ b/testsuite/tests/rts/alloccounter1.hs
@@ -0,0 +1,19 @@
+module Main where
+
+import Control.Exception
+import Control.Monad
+import Data.List
+import System.Mem
+
+main = do
+  let
+    testAlloc n = do
+      let start = 999999
+      setAllocationCounter start
+      evaluate (last [1..n])
+      c <- getAllocationCounter
+      -- print (start - c)
+      return (start - c)
+  results <- forM [1..1000] testAlloc
+  print (sort results == results)
+    -- results better be in ascending order
diff --git a/testsuite/tests/rts/alloccounter1.stdout b/testsuite/tests/rts/alloccounter1.stdout
new file mode 100644
index 0000000000..0ca95142bb
--- /dev/null
+++ b/testsuite/tests/rts/alloccounter1.stdout
@@ -0,0 +1 @@
+True
diff --git a/testsuite/tests/rts/flags/Makefile b/testsuite/tests/rts/flags/Makefile
index 61900477f9..9101fbd40a 100644
--- a/testsuite/tests/rts/flags/Makefile
+++ b/testsuite/tests/rts/flags/Makefile
@@ -1,6 +1,3 @@
 TOP=../../..
 include $(TOP)/mk/boilerplate.mk
 include $(TOP)/mk/test.mk
-
-T1791:
-	'$(TEST_HC)' T1791.hs -o T1791 -O -rtsopts
diff --git a/testsuite/tests/rts/flags/T12870.hs b/testsuite/tests/rts/flags/T12870.hs
index 8d536d58d6..69086cde2b 100644
--- a/testsuite/tests/rts/flags/T12870.hs
+++ b/testsuite/tests/rts/flags/T12870.hs
@@ -1,4 +1,5 @@
-module T12870 where
+--We check if RTS arguments are properly filtered/passed along
+--by outputting them to stdout.
 
 import System.Environment
 
diff --git a/testsuite/tests/rts/flags/T12870g.hs b/testsuite/tests/rts/flags/T12870g.hs
index e409349827..3efd633ddd 100644
--- a/testsuite/tests/rts/flags/T12870g.hs
+++ b/testsuite/tests/rts/flags/T12870g.hs
@@ -1,4 +1,5 @@
-module T12870g where
+--We check the generation count as a way to verify an RTS argument
+--was actually parsed and accepted by the RTS.
 
 import GHC.RTS.Flags (getGCFlags, generations)
 
diff --git a/testsuite/tests/rts/flags/all.T b/testsuite/tests/rts/flags/all.T
index 33a28e500a..548fcd8631 100644
--- a/testsuite/tests/rts/flags/all.T
+++ b/testsuite/tests/rts/flags/all.T
@@ -1,44 +1,51 @@
 #Standard handling of RTS arguments
 test('T12870a',
-    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs'])],
+    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs']),
+        omit_ways(['ghci'])],
     multimod_compile_and_run,
-    ['T12870', '-rtsopts -main-is T12870'])
+    ['T12870', '-rtsopts'])
 
 test('T12870b',
     [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs']),
-        exit_code(1), ignore_stderr],
+        exit_code(1), ignore_stderr, omit_ways(['ghci'])],
     multimod_compile_and_run,
-    ['T12870', '-rtsopts=none -main-is T12870'])
+    ['T12870', '-rtsopts=none'])
 
 test('T12870c',
     [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs']),
-    exit_code(1)],
+        exit_code(1), omit_ways(['ghci'])],
     multimod_compile_and_run,
-    ['T12870', '-rtsopts=some -main-is T12870'])
+    ['T12870', '-rtsopts=some'])
 
 test('T12870d',
-    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs'])],
+    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs']),
+        omit_ways(['ghci'])],
     multimod_compile_and_run,
-    ['T12870', '-main-is T12870'])
+    ['T12870', ''])
 
 #RTS options should be passed along to the program
 test('T12870e',
-    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs'])],
+    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs']),
+        omit_ways(['ghci', 'threaded2'])],
     multimod_compile_and_run,
-    ['T12870', '-rtsopts=ignore -main-is T12870'])
+    ['T12870', '-rtsopts=ignore'])
+
 test('T12870f',
-    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs'])],
+    [extra_run_opts('+RTS -G2 -RTS arg1 --RTS +RTS arg2'), extra_files(['T12870.hs']),
+        omit_ways(['ghci', 'threaded2'])],
     multimod_compile_and_run,
-    ['T12870', '-rtsopts=ignoreAll -main-is T12870'])
+    ['T12870', '-rtsopts=ignoreAll'])
 
 #Check handling of env variables
 test('T12870g',
-    [extra_files(['T12870g.hs']), cmd_prefix('GHCRTS=-G7 '), extra_files(['T12870g.hs'])],
+    [extra_files(['T12870g.hs']), cmd_prefix('GHCRTS=-G7 '), extra_files(['T12870g.hs']),
+        omit_ways(['ghci'])],
     multimod_compile_and_run,
-    ['T12870g', '-rtsopts -main-is T12870g -with-rtsopts="-G3"'])
+    ['T12870g', '-rtsopts -with-rtsopts="-G3"'])
 
 test('T12870h',
-    [extra_files(['T12870g.hs']), cmd_prefix('GHCRTS=-G7 '), extra_files(['T12870g.hs'])],
+    [extra_files(['T12870g.hs']), cmd_prefix('GHCRTS=-G7 '), extra_files(['T12870g.hs']),
+        omit_ways(['ghci'])],
     multimod_compile_and_run,
-    ['T12870g', '-rtsopts=ignoreAll -main-is T12870g -with-rtsopts="-G3"'])
+    ['T12870g', '-rtsopts=ignoreAll -with-rtsopts="-G3"'])
 
diff --git a/testsuite/tests/th/T13776.hs b/testsuite/tests/th/T13776.hs
new file mode 100644
index 0000000000..6082825592
--- /dev/null
+++ b/testsuite/tests/th/T13776.hs
@@ -0,0 +1,23 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+module T13776 where
+
+import Language.Haskell.TH
+
+spliceTy1 :: $(conT ''(,) `appT` conT ''Int `appT` conT ''Int)
+spliceTy1 = (1,2)
+
+spliceTy2 :: $(conT ''[] `appT` conT ''Int)
+spliceTy2 = []
+
+spliceExp1 :: (Int, Int)
+spliceExp1 = $(conE '(,) `appE` litE (integerL 1) `appE` litE (integerL 1))
+
+spliceExp2 :: [Int]
+spliceExp2 = $(conE '[])
+
+splicePat1 :: (Int, Int) -> ()
+splicePat1 $(conP '(,) [litP (integerL 1), litP (integerL 1)]) = ()
+
+splicePat2 :: [Int] -> ()
+splicePat2 $(conP '[] []) = ()
diff --git a/testsuite/tests/th/T13776.stderr b/testsuite/tests/th/T13776.stderr
new file mode 100644
index 0000000000..485dc64a28
--- /dev/null
+++ b/testsuite/tests/th/T13776.stderr
@@ -0,0 +1,14 @@
+T13776.hs:10:16-42: Splicing type
+    conT ''[] `appT` conT ''Int ======> [] Int
+T13776.hs:7:16-61: Splicing type
+    conT ''(,) `appT` conT ''Int `appT` conT ''Int ======> (,) Int Int
+T13776.hs:14:16-74: Splicing expression
+    conE '(,) `appE` litE (integerL 1) `appE` litE (integerL 1)
+  ======>
+    ((,) 1) 1
+T13776.hs:17:16-23: Splicing expression
+    conE '[] ======> []
+T13776.hs:20:14-61: Splicing pattern
+    conP '(,) [litP (integerL 1), litP (integerL 1)] ======> (,) 1 1
+T13776.hs:23:14-24: Splicing pattern
+    conP '[] [] ======> []
diff --git a/testsuite/tests/th/T3319.stderr b/testsuite/tests/th/T3319.stderr
index 44ec90ffe7..b88b10f90f 100644
--- a/testsuite/tests/th/T3319.stderr
+++ b/testsuite/tests/th/T3319.stderr
@@ -4,4 +4,4 @@ T3319.hs:8:3-93: Splicing declarations
          (ImportF
             CCall Unsafe "&" (mkName "foo") (AppT (ConT ''Ptr) (ConT ''())))]
   ======>
-    foreign import ccall unsafe "&" foo :: Ptr GHC.Tuple.()
+    foreign import ccall unsafe "&" foo :: Ptr ()
diff --git a/testsuite/tests/th/T5700.stderr b/testsuite/tests/th/T5700.stderr
index 729a36604f..3564b8cb2a 100644
--- a/testsuite/tests/th/T5700.stderr
+++ b/testsuite/tests/th/T5700.stderr
@@ -3,4 +3,4 @@ T5700.hs:8:3-9: Splicing declarations
   ======>
     instance C D where
       {-# INLINE inlinable #-}
-      inlinable _ = GHC.Tuple.()
+      inlinable _ = ()
diff --git a/testsuite/tests/th/TH_foreignInterruptible.stderr b/testsuite/tests/th/TH_foreignInterruptible.stderr
index 7131eeee71..4afc38aab1 100644
--- a/testsuite/tests/th/TH_foreignInterruptible.stderr
+++ b/testsuite/tests/th/TH_foreignInterruptible.stderr
@@ -8,4 +8,4 @@ TH_foreignInterruptible.hs:8:3-100: Splicing declarations
             (mkName "foo")
             (AppT (ConT ''Ptr) (ConT ''())))]
   ======>
-    foreign import ccall interruptible "&" foo :: Ptr GHC.Tuple.()
+    foreign import ccall interruptible "&" foo :: Ptr ()
diff --git a/testsuite/tests/th/all.T b/testsuite/tests/th/all.T
index e9f2838492..b51059ca1c 100644
--- a/testsuite/tests/th/all.T
+++ b/testsuite/tests/th/all.T
@@ -403,5 +403,6 @@ test('T14838', [], multimod_compile,
      ['T14838.hs', '-v0 -Wincomplete-patterns ' + config.ghc_th_way_flags])
 test('T14817', normal, compile, ['-v0 -ddump-splices -dsuppress-uniques'])
 test('T14843', normal, compile, ['-v0'])
+test('T13776', normal, compile, ['-ddump-splices -v0'])
 test('T14888', normal, compile,
     ['-v0 -ddump-splices -dsuppress-uniques ' + config.ghc_th_way_flags])
diff --git a/testsuite/tests/typecheck/should_compile/Makefile b/testsuite/tests/typecheck/should_compile/Makefile
index c3065f3cd2..ac18b2f4b3 100644
--- a/testsuite/tests/typecheck/should_compile/Makefile
+++ b/testsuite/tests/typecheck/should_compile/Makefile
@@ -70,3 +70,8 @@ T13585:
 	'$(TEST_HC)' $(TEST_HC_OPTS) -c T13585a.hs -O
 	'$(TEST_HC)' $(TEST_HC_OPTS) -c T13585b.hs -O
 	'$(TEST_HC)' $(TEST_HC_OPTS) -c T13585.hs  -O
+
+T14934:
+	$(RM) -f T14934a.o T14934a.hi T14934.o T14934.hi
+	'$(TEST_HC)' $(TEST_HC_OPTS) -c T14934a.hs -O
+	'$(TEST_HC)' $(TEST_HC_OPTS) -c T14934.hs  -O
diff --git a/testsuite/tests/typecheck/should_compile/T14934.hs b/testsuite/tests/typecheck/should_compile/T14934.hs
new file mode 100644
index 0000000000..581e93186e
--- /dev/null
+++ b/testsuite/tests/typecheck/should_compile/T14934.hs
@@ -0,0 +1,9 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE TypeOperators #-}
+module T14934 where
+
+import T14934a
+import GHC.TypeLits
+
+g :: Foo (1 - 0)
+g = f MkFoo1
diff --git a/testsuite/tests/typecheck/should_compile/T14934a.hs b/testsuite/tests/typecheck/should_compile/T14934a.hs
new file mode 100644
index 0000000000..3ba59ff976
--- /dev/null
+++ b/testsuite/tests/typecheck/should_compile/T14934a.hs
@@ -0,0 +1,14 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE KindSignatures #-}
+{-# LANGUAGE TypeOperators #-}
+module T14934a where
+
+import GHC.TypeLits
+
+data Foo :: Nat -> * where
+  MkFoo0 :: Foo 0
+  MkFoo1 :: Foo 1
+
+f :: Foo (1 - 0) -> Foo 1
+f x = x
diff --git a/testsuite/tests/typecheck/should_compile/all.T b/testsuite/tests/typecheck/should_compile/all.T
index 2d3c3cd118..9a2ce73263 100644
--- a/testsuite/tests/typecheck/should_compile/all.T
+++ b/testsuite/tests/typecheck/should_compile/all.T
@@ -597,3 +597,5 @@ test('T14732', normal, compile, [''])
 test('T14774', [], run_command, ['$MAKE -s --no-print-directory T14774'])
 test('T14763', normal, compile, [''])
 test('T14811', normal, compile, [''])
+test('T14934', [extra_files(['T14934.hs', 'T14934a.hs'])], run_command,
+               ['$MAKE -s --no-print-directory T14934'])
diff --git a/testsuite/tests/typecheck/should_fail/T14048a.hs b/testsuite/tests/typecheck/should_fail/T14048a.hs
new file mode 100644
index 0000000000..c717127df8
--- /dev/null
+++ b/testsuite/tests/typecheck/should_fail/T14048a.hs
@@ -0,0 +1,6 @@
+{-# LANGUAGE ConstraintKinds #-}
+module T14048a where
+
+import Data.Kind
+
+data Foo :: Constraint
diff --git a/testsuite/tests/typecheck/should_fail/T14048a.stderr b/testsuite/tests/typecheck/should_fail/T14048a.stderr
new file mode 100644
index 0000000000..48a91c7525
--- /dev/null
+++ b/testsuite/tests/typecheck/should_fail/T14048a.stderr
@@ -0,0 +1,5 @@
+
+T14048a.hs:6:1: error:
+    • Kind signature on data type declaration has non-* return kind
+        Constraint
+    • In the data declaration for ‘Foo’
diff --git a/testsuite/tests/typecheck/should_fail/T14048b.hs b/testsuite/tests/typecheck/should_fail/T14048b.hs
new file mode 100644
index 0000000000..d2f6f74583
--- /dev/null
+++ b/testsuite/tests/typecheck/should_fail/T14048b.hs
@@ -0,0 +1,7 @@
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE TypeFamilies #-}
+module T14048b where
+
+import Data.Kind
+
+data family Foo :: Constraint
diff --git a/testsuite/tests/typecheck/should_fail/T14048b.stderr b/testsuite/tests/typecheck/should_fail/T14048b.stderr
new file mode 100644
index 0000000000..fe78d9f7f5
--- /dev/null
+++ b/testsuite/tests/typecheck/should_fail/T14048b.stderr
@@ -0,0 +1,6 @@
+
+T14048b.hs:7:1: error:
+    • Kind signature on data type declaration has non-*
+      and non-variable return kind
+        Constraint
+    • In the data family declaration for ‘Foo’
diff --git a/testsuite/tests/typecheck/should_fail/T14048c.hs b/testsuite/tests/typecheck/should_fail/T14048c.hs
new file mode 100644
index 0000000000..e81e454d31
--- /dev/null
+++ b/testsuite/tests/typecheck/should_fail/T14048c.hs
@@ -0,0 +1,9 @@
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE PolyKinds #-}
+{-# LANGUAGE TypeFamilies #-}
+module T14048c where
+
+import Data.Kind
+
+data family   Foo :: k
+data instance Foo :: Constraint
diff --git a/testsuite/tests/typecheck/should_fail/T14048c.stderr b/testsuite/tests/typecheck/should_fail/T14048c.stderr
new file mode 100644
index 0000000000..7e83d1924c
--- /dev/null
+++ b/testsuite/tests/typecheck/should_fail/T14048c.stderr
@@ -0,0 +1,5 @@
+
+T14048c.hs:9:1: error:
+    • Kind signature on data type declaration has non-* return kind
+        Constraint
+    • In the data instance declaration for ‘Foo’
diff --git a/testsuite/tests/typecheck/should_fail/all.T b/testsuite/tests/typecheck/should_fail/all.T
index 5377fefeb7..f01cbe1da7 100644
--- a/testsuite/tests/typecheck/should_fail/all.T
+++ b/testsuite/tests/typecheck/should_fail/all.T
@@ -457,6 +457,9 @@ test('T14000', normal, compile_fail, [''])
 test('T14055', normal, compile_fail, [''])
 test('T13909', normal, compile_fail, [''])
 test('T13929', normal, compile_fail, [''])
+test('T14048a', normal, compile_fail, [''])
+test('T14048b', normal, compile_fail, [''])
+test('T14048c', normal, compile_fail, [''])
 test('T14232', normal, compile_fail, [''])
 test('T14325', normal, compile_fail, [''])
 test('T14350', normal, compile_fail, [''])
diff --git a/utils/llvm-targets/gen-data-layout.sh b/utils/llvm-targets/gen-data-layout.sh
index 834a978809..4f0b24076a 100755
--- a/utils/llvm-targets/gen-data-layout.sh
+++ b/utils/llvm-targets/gen-data-layout.sh
@@ -16,17 +16,28 @@
 # Add missing targets to the list below to have them included in
 # llvm-targets file.
 
-# Target sets
-WINDOWS_x86="i386-unknown-windows i686-unknown-windows x86_64-unknown-windows"
-LINUX_ARM="arm-unknown-linux-gnueabihf armv6-unknown-linux-gnueabihf armv7-unknown-linux-gnueabihf aarch64-unknown-linux-gnu aarch64-unknown-linux armv7a-unknown-linux-gnueabi"
-LINUX_x86="i386-unknown-linux-gnu i386-unknown-linux x86_64-unknown-linux-gnu x86_64-unknown-linux"
-ANDROID="armv7-unknown-linux-androideabi aarch64-unknown-linux-android"
-QNX="arm-unknown-nto-qnx-eabi"
-MACOS="i386-apple-darwin x86_64-apple-darwin"
-IOS="armv7-apple-ios arm64-apple-ios i386-apple-ios x86_64-apple-ios"
+# Target sets for which to generate the llvm-targets file
+TARGETS=(
+    # Windows x86
+    "i386-unknown-windows" "i686-unknown-windows" "x86_64-unknown-windows"
 
-# targets for which to generate the llvm-targets file
-TARGETS="${WINDOWS_x86} ${LINUX_ARM} ${LINUX_x86} ${ANDROID} ${QNX} ${MACOS} ${IOS}"
+    # Linux ARM
+    "arm-unknown-linux-gnueabihf" "armv6-unknown-linux-gnueabihf"
+    "armv7-unknown-linux-gnueabihf" "armv7a-unknown-linux-gnueabi"
+    "aarch64-unknown-linux-gnu" "aarch64-unknown-linux"
+    # Linux x86
+    "i386-unknown-linux-gnu" "i386-unknown-linux" "x86_64-unknown-linux-gnu" "x86_64-unknown-linux"
+    # Linux Android
+    "armv7-unknown-linux-androideabi" "aarch64-unknown-linux-android"
+
+    # QNX
+    "arm-unknown-nto-qnx-eabi"
+
+    # macOS
+    "i386-apple-darwin" "x86_64-apple-darwin"
+    # iOS
+    "armv7-apple-ios arm64-apple-ios" "i386-apple-ios x86_64-apple-ios"
+)
 
 # given the call to clang -c11 that clang --target -v generates,
 # parse the -target-cpu <CPU> and -target-feature <feature> from
@@ -61,7 +72,7 @@ FST=1
 FILE=_____dummy.c
 touch $FILE
 
-for target in $TARGETS; do
+for target in "${TARGETS[@]}"; do
     # find the cpu and attributes emitte by clang for the given $target
     CPU=""
     ATTR=()