8 files changed, 544 insertions, 214 deletions
diff --git a/compiler/nativeGen/AsmCodeGen.hs b/compiler/nativeGen/AsmCodeGen.hs
index 4c883e7185..a40bf02013 100644
--- a/compiler/nativeGen/AsmCodeGen.hs
+++ b/compiler/nativeGen/AsmCodeGen.hs
@@ -534,6 +534,10 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
  = do
         let platform = targetPlatform dflags
 
+        let proc_name = case cmm of
+                (CmmProc _ entry_label _ _) -> ppr entry_label
+                _                           -> text "DataChunk"
+
         -- rewrite assignments to global regs
         let fixed_cmm =
                 {-# SCC "fixStgRegisters" #-}
@@ -562,12 +566,11 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
                 Opt_D_dump_asm_native "Native code"
                 (vcat $ map (pprNatCmmDecl ncgImpl) native)
 
-        when (not $ null nativeCfgWeights) $ dumpIfSet_dyn dflags
-                Opt_D_dump_cfg_weights "CFG Weights"
-                (pprEdgeWeights nativeCfgWeights)
+        maybeDumpCfg dflags (Just nativeCfgWeights) "CFG Weights - Native" proc_name
 
         -- tag instructions with register liveness information
-        -- also drops dead code
+        -- also drops dead code. We don't keep the cfg in sync on
+        -- some backends, so don't use it there.
         let livenessCfg = if (backendMaintainsCfg dflags)
                                 then Just nativeCfgWeights
                                 else Nothing
@@ -705,10 +708,7 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
             optimizedCFG =
                 optimizeCFG (cfgWeightInfo dflags) cmm <$!> postShortCFG
 
-        maybe (return ()) (\cfg->
-                dumpIfSet_dyn dflags Opt_D_dump_cfg_weights "CFG Final Weights"
-                ( pprEdgeWeights cfg ))
-                optimizedCFG
+        maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name
 
         --TODO: Partially check validity of the cfg.
         let getBlks (CmmProc _info _lbl _live (ListGraph blocks)) = blocks
@@ -771,6 +771,15 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
                 , ppr_raStatsLinear
                 , unwinds )
 
+maybeDumpCfg :: DynFlags -> Maybe CFG -> String -> SDoc -> IO ()
+maybeDumpCfg _dflags Nothing _ _ = return ()
+maybeDumpCfg dflags (Just cfg) msg proc_name
+        | null cfg = return ()
+        | otherwise
+        = dumpIfSet_dyn
+                dflags Opt_D_dump_cfg_weights msg
+                (proc_name <> char ':' $$ pprEdgeWeights cfg)
+
 -- | Make sure all blocks we want the layout algorithm to place have been placed.
 checkLayout :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
             -> [NatCmmDecl statics instr]
diff --git a/compiler/nativeGen/CFG.hs b/compiler/nativeGen/CFG.hs
index 8eb69a9dbf..fb17d269a8 100644
--- a/compiler/nativeGen/CFG.hs
+++ b/compiler/nativeGen/CFG.hs
@@ -82,7 +82,6 @@ import PprCmm () -- For Outputable instances
 import qualified DynFlags as D
 
 import Data.List
-
 import Data.STRef.Strict
 import Control.Monad.ST
 
@@ -109,6 +108,13 @@ instance Outputable EdgeWeight where
 type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo)
 
 -- | A control flow graph where edges have been annotated with a weight.
+-- Implemented as IntMap (IntMap <edgeData>)
+-- We must uphold the invariant that for each edge A -> B we must have:
+-- A entry B in the outer map.
+-- A entry B in the map we get when looking up A.
+-- Maintaining this invariant is useful as any failed lookup now indicates
+-- an actual error in code which might go unnoticed for a while
+-- otherwise.
 type CFG = EdgeInfoMap EdgeInfo
 
 data CfgEdge
@@ -199,13 +205,20 @@ setEdgeWeight cfg !weight from to
   | otherwise = cfg
 
 
-
-getCfgNodes :: CFG -> LabelSet
+getCfgNodes :: CFG -> [BlockId]
 getCfgNodes m =
-    mapFoldlWithKey (\s k toMap -> mapFoldlWithKey (\s k _ -> setInsert k s) (setInsert k s) toMap ) setEmpty m
+    mapKeys m
 
+-- | Is this block part of this graph?
 hasNode :: CFG -> BlockId -> Bool
-hasNode m node = mapMember node m || any (mapMember node) m
+hasNode m node =
+  -- Check the invariant that each node must exist in the first map or not at all.
+  ASSERT( found || not (any (mapMember node) m))
+  found
+    where
+      found = mapMember node m
+
+
 
 -- | Check if the nodes in the cfg and the set of blocks are the same.
 --   In a case of a missmatch we panic and show the difference.
@@ -217,11 +230,11 @@ sanityCheckCfg m blockSet msg
         pprPanic "Block list and cfg nodes don't match" (
             text "difference:" <+> ppr diff $$
             text "blocks:" <+> ppr blockSet $$
-            text "cfg:" <+> ppr m $$
+            text "cfg:" <+> pprEdgeWeights m $$
             msg )
             False
     where
-      cfgNodes = getCfgNodes m :: LabelSet
+      cfgNodes = setFromList $ getCfgNodes m :: LabelSet
       diff = (setUnion cfgNodes blockSet) `setDifference` (setIntersection cfgNodes blockSet) :: LabelSet
 
 -- | Filter the CFG with a custom function f.
@@ -332,10 +345,16 @@ addImmediateSuccessor node follower cfg
 -- | Adds a new edge, overwrites existing edges if present
 addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG
 addEdge from to info cfg =
-    mapAlter addDest from cfg
+    mapAlter addFromToEdge from $
+    mapAlter addDestNode to cfg
     where
-        addDest Nothing = Just $ mapSingleton to info
-        addDest (Just wm) = Just $ mapInsert to info wm
+        -- Simply insert the edge into the edge list.
+        addFromToEdge Nothing = Just $ mapSingleton to info
+        addFromToEdge (Just wm) = Just $ mapInsert to info wm
+        -- We must add the destination node explicitly
+        addDestNode Nothing = Just $ mapEmpty
+        addDestNode n@(Just _) = n
+
 
 -- | Adds a edge with the given weight to the cfg
 --   If there already existed an edge it is overwritten.
@@ -366,8 +385,11 @@ getSuccEdgesSorted m bid =
         sortedEdges
 
 -- | Get successors of a given node with edge weights.
-getSuccessorEdges :: CFG -> BlockId -> [(BlockId,EdgeInfo)]
-getSuccessorEdges m bid = maybe [] mapToList $ mapLookup bid m
+getSuccessorEdges :: HasDebugCallStack => CFG -> BlockId -> [(BlockId,EdgeInfo)]
+getSuccessorEdges m bid = maybe lookupError mapToList (mapLookup bid m)
+  where
+    lookupError = pprPanic "getSuccessorEdges: Block does not exist" $
+                    ppr bid <+> pprEdgeWeights m
 
 getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo
 getEdgeInfo from to m
@@ -389,7 +411,7 @@ getTransitionSource from to cfg = transitionSource $ expectJust "Source info for
 reverseEdges :: CFG -> CFG
 reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg
   where
-    -- We preserve nodes without outgoing edges!
+    -- We must preserve nodes without outgoing edges!
     addNode :: CFG -> BlockId -> CFG
     addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg
     go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG
@@ -427,11 +449,14 @@ edgeList m =
       = go' froms from tos ((from,to) : acc)
 
 -- | Get successors of a given node without edge weights.
-getSuccessors :: CFG -> BlockId -> [BlockId]
+getSuccessors :: HasDebugCallStack => CFG -> BlockId -> [BlockId]
 getSuccessors m bid
     | Just wm <- mapLookup bid m
     = mapKeys wm
-    | otherwise = []
+    | otherwise = lookupError
+    where
+      lookupError = pprPanic "getSuccessors: Block does not exist" $
+                    ppr bid <+> pprEdgeWeights m
 
 pprEdgeWeights :: CFG -> SDoc
 pprEdgeWeights m =
@@ -455,6 +480,7 @@ pprEdgeWeights m =
     text "}\n"
 
 {-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible
+-- | Invariant: The edge **must** exist already in the graph.
 updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG
 updateEdgeWeight f (from, to) cfg
     | Just oldInfo <- getEdgeInfo from to cfg
@@ -503,7 +529,7 @@ addNodesBetween m updates =
         = pprPanic "Can't find weight for edge that should have one" (
             text "triple" <+> ppr (from,between,old) $$
             text "updates" <+> ppr updates $$
-            text "cfg:" <+> ppr m )
+            text "cfg:" <+> pprEdgeWeights m )
       updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG
       updateWeight m (from,between,old,edgeInfo)
         = addEdge from between edgeInfo .
@@ -634,7 +660,7 @@ getCfg weights graph =
     blocks = revPostorder graph :: [CmmBlock]
 
 --Find back edges by BFS
-findBackEdges :: BlockId -> CFG -> Edges
+findBackEdges :: HasDebugCallStack => BlockId -> CFG -> Edges
 findBackEdges root cfg =
     --pprTraceIt "Backedges:" $
     map fst .
@@ -714,7 +740,7 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg =
                     (adjustEdgeWeight cfg  (+mod1) node s1)
               | otherwise
               = cfg
-        in setFoldl update cfg nodes
+        in foldl' update cfg nodes
       where
         fallthroughTarget :: BlockId -> EdgeInfo -> Bool
         fallthroughTarget to (EdgeInfo source _weight)
@@ -726,13 +752,13 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg =
 
 -- | Determine loop membership of blocks based on SCC analysis
 --   This is faster but only gives yes/no answers.
-loopMembers :: CFG -> LabelMap Bool
+loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool
 loopMembers cfg =
     foldl' (flip setLevel) mapEmpty sccs
   where
     mkNode :: BlockId -> Node BlockId BlockId
     mkNode bid = DigraphNode bid bid (getSuccessors cfg bid)
-    nodes = map mkNode (setElems $ getCfgNodes cfg)
+    nodes = map mkNode (getCfgNodes cfg)
 
     sccs = stronglyConnCompFromEdgedVerticesOrd nodes
 
@@ -741,7 +767,9 @@ loopMembers cfg =
     setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids
 
 loopLevels :: CFG -> BlockId -> LabelMap Int
-loopLevels cfg root = liLevels $ loopInfo cfg root
+loopLevels cfg root = liLevels loopInfos
+    where
+      loopInfos = loopInfo cfg root
 
 data LoopInfo = LoopInfo
   { liBackEdges :: [(Edge)] -- ^ List of back edges
@@ -754,23 +782,39 @@ instance Outputable LoopInfo where
         text "Loops:(backEdge, bodyNodes)" $$
             (vcat $ map ppr loops)
 
+{-  Note [Determining the loop body]
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Starting with the knowledge that:
+    * head dominates the loop
+    * `tail` -> `head` is a backedge
+
+    We can determine all nodes by:
+    * Deleting the loop head from the graph.
+    * Collect all blocks which are reachable from the `tail`.
+
+    We do so by performing bfs from the tail node towards the head.
+ -}
+
 -- | Determine loop membership of blocks based on Dominator analysis.
 --   This is slower but gives loop levels instead of just loop membership.
 --   However it only detects natural loops. Irreducible control flow is not
 --   recognized even if it loops. But that is rare enough that we don't have
 --   to care about that special case.
-loopInfo :: CFG -> BlockId -> LoopInfo
+loopInfo :: HasDebugCallStack => CFG -> BlockId -> LoopInfo
 loopInfo cfg root = LoopInfo  { liBackEdges = backEdges
                               , liLevels = mapFromList loopCounts
                               , liLoops = loopBodies }
   where
     revCfg = reverseEdges cfg
-    graph = fmap (setFromList . mapKeys ) cfg :: LabelMap LabelSet
+
+    graph = -- pprTrace "CFG - loopInfo" (pprEdgeWeights cfg) $
+            fmap (setFromList . mapKeys ) cfg :: LabelMap LabelSet
+
 
     --TODO - This should be a no op: Export constructors? Use unsafeCoerce? ...
     rooted = ( fromBlockId root
               , toIntMap $ fmap toIntSet graph) :: (Int, IntMap IntSet)
-    -- rooted = unsafeCoerce (root, graph)
     tree = fmap toBlockId $ Dom.domTree rooted :: Tree BlockId
 
     -- Map from Nodes to their dominators
@@ -778,8 +822,8 @@ loopInfo cfg root = LoopInfo  { liBackEdges = backEdges
     domMap = mkDomMap tree
 
     edges = edgeList cfg :: [(BlockId, BlockId)]
-    -- We can't recompute this from the edges, there might be blocks not connected via edges.
-    nodes = getCfgNodes cfg :: LabelSet
+    -- We can't recompute nodes from edges, there might be blocks not connected via edges.
+    nodes = getCfgNodes cfg :: [BlockId]
 
     -- identify back edges
     isBackEdge (from,to)
@@ -788,22 +832,26 @@ loopInfo cfg root = LoopInfo  { liBackEdges = backEdges
       = True
       | otherwise = False
 
-    -- determine the loop body for a back edge
+    -- See Note [Determining the loop body]
+    -- Get the loop body associated with a back edge.
     findBody edge@(tail, head)
       = ( edge, setInsert head $ go (setSingleton tail) (setSingleton tail) )
       where
-        -- The reversed cfg makes it easier to look up predecessors
+        -- See Note [Determining the loop body]
         cfg' = delNode head revCfg
+
         go :: LabelSet -> LabelSet -> LabelSet
         go found current
           | setNull current = found
           | otherwise = go  (setUnion newSuccessors found)
                             newSuccessors
           where
+            -- Really predecessors, since we use the reversed cfg.
             newSuccessors = setFilter (\n -> not $ setMember n found) successors :: LabelSet
             successors = setFromList $ concatMap
                                       (getSuccessors cfg')
-                                      (setElems current) :: LabelSet
+                                      -- we filter head as it's no longer part of the cfg.
+                                      (filter (/= head) $ setElems current) :: LabelSet
 
     backEdges = filter isBackEdge edges
     loopBodies = map findBody backEdges :: [(Edge, LabelSet)]
@@ -812,7 +860,7 @@ loopInfo cfg root = LoopInfo  { liBackEdges = backEdges
     loopCounts =
       let bodies = map (first snd) loopBodies -- [(Header, Body)]
           loopCount n = length $ nub . map fst . filter (setMember n . snd) $ bodies
-      in  map (\n -> (n, loopCount n)) $ setElems nodes :: [(BlockId, Int)]
+      in  map (\n -> (n, loopCount n)) $ nodes :: [(BlockId, Int)]
 
     toIntSet :: LabelSet -> IntSet
     toIntSet s = IS.fromList . map fromBlockId . setElems $ s
@@ -845,12 +893,12 @@ instance G.NonLocal (BlockNode) where
   entryLabel (BN (lbl,_))   = lbl
   successors (BN (_,succs)) = succs
 
-revPostorderFrom :: CFG -> BlockId -> [BlockId]
+revPostorderFrom :: HasDebugCallStack => CFG -> BlockId -> [BlockId]
 revPostorderFrom cfg root =
     map fromNode $ G.revPostorderFrom hooplGraph root
   where
     nodes = getCfgNodes cfg
-    hooplGraph = setFoldl (\m n -> mapInsert n (toNode n) m) mapEmpty nodes
+    hooplGraph = foldl' (\m n -> mapInsert n (toNode n) m) mapEmpty nodes
 
     fromNode :: BlockNode C C -> BlockId
     fromNode (BN x) = fst x
@@ -876,14 +924,13 @@ revPostorderFrom cfg root =
 --
 --   We also apply a few prediction heuristics (based on the same paper)
 
+{-# NOINLINE mkGlobalWeights #-}
 {-# SCC mkGlobalWeights #-}
-mkGlobalWeights :: BlockId -> CFG -> (LabelMap Double, LabelMap (LabelMap Double))
+mkGlobalWeights :: HasDebugCallStack => BlockId -> CFG -> (LabelMap Double, LabelMap (LabelMap Double))
 mkGlobalWeights root localCfg
   | null localCfg = panic "Error - Empty CFG"
   | otherwise
-  = --pprTrace "revOrder" (ppr revOrder) $
-    -- undefined --propagate (mapSingleton root 1) (revOrder)
-    (blockFreqs', edgeFreqs')
+  = (blockFreqs', edgeFreqs')
   where
     -- Calculate fixpoints
     (blockFreqs, edgeFreqs) = calcFreqs nodeProbs backEdges' bodies' revOrder'
@@ -894,13 +941,13 @@ mkGlobalWeights root localCfg
     fromVertexMap m = mapFromList . map (first fromVertex) $ IM.toList m
 
     revOrder = revPostorderFrom localCfg root :: [BlockId]
-    loopinfo@(LoopInfo backedges _levels bodies) = loopInfo localCfg root
+    loopResults@(LoopInfo backedges _levels bodies) = loopInfo localCfg root
 
     revOrder' = map toVertex revOrder
     backEdges' = map (bimap toVertex toVertex) backedges
     bodies' = map calcBody bodies
 
-    estimatedCfg = staticBranchPrediction root loopinfo localCfg
+    estimatedCfg = staticBranchPrediction root loopResults localCfg
     -- Normalize the weights to probabilities and apply heuristics
     nodeProbs = cfgEdgeProbabilities estimatedCfg toVertex
 
@@ -965,7 +1012,7 @@ type TargetNodeInfo = (BlockId, EdgeInfo)
 staticBranchPrediction :: BlockId -> LoopInfo -> CFG -> CFG
 staticBranchPrediction _root (LoopInfo l_backEdges loopLevels l_loops) cfg =
     -- pprTrace "staticEstimatesOn" (ppr (cfg)) $
-    setFoldl update cfg nodes
+    foldl' update cfg nodes
   where
     nodes = getCfgNodes cfg
     backedges = S.fromList $ l_backEdges
@@ -1248,8 +1295,10 @@ calcFreqs graph backEdges loops revPostOrder = runST $ do
 
     return (freqs', graph')
   where
+    -- How can these lookups fail? Consider the CFG [A -> B]
     predecessors :: Int -> IS.IntSet
     predecessors b = fromMaybe IS.empty $ IM.lookup b revGraph
+    successors :: Int -> [Int]
     successors b = fromMaybe (lookupError "succ" b graph)$ IM.keys <$> IM.lookup b graph
     lookupError s b g = pprPanic ("Lookup error " ++ s) $
                             ( text "node" <+> ppr b $$
diff --git a/compiler/nativeGen/NCGMonad.hs b/compiler/nativeGen/NCGMonad.hs
index cf3c58844f..71503aa653 100644
--- a/compiler/nativeGen/NCGMonad.hs
+++ b/compiler/nativeGen/NCGMonad.hs
@@ -1,5 +1,6 @@
 {-# LANGUAGE CPP #-}
 {-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE BangPatterns #-}
 
 -- -----------------------------------------------------------------------------
 --
@@ -204,7 +205,8 @@ addImportNat imp
 
 updateCfgNat :: (CFG -> CFG) -> NatM ()
 updateCfgNat f
-        = NatM $ \ st -> ((), st { natm_cfg = f (natm_cfg st) })
+        = NatM $ \ st -> let !cfg' = f (natm_cfg st)
+                         in ((), st { natm_cfg = cfg'})
 
 -- | Record that we added a block between `from` and `old`.
 addNodeBetweenNat :: BlockId -> BlockId -> BlockId -> NatM ()
diff --git a/compiler/nativeGen/RegAlloc/Liveness.hs b/compiler/nativeGen/RegAlloc/Liveness.hs
index 3f160ea678..a5a9b503cd 100644
--- a/compiler/nativeGen/RegAlloc/Liveness.hs
+++ b/compiler/nativeGen/RegAlloc/Liveness.hs
@@ -705,8 +705,8 @@ sccBlocks blocks entries mcfg = map (fmap node_payload) sccs
         reachable :: LabelSet
         reachable
             | Just cfg <- mcfg
-            -- Our CFG only contains reachable nodes by construction.
-            = getCfgNodes cfg
+            -- Our CFG only contains reachable nodes by construction at this point.
+            = setFromList $ getCfgNodes cfg
             | otherwise
             = setFromList $ [ node_key node | node <- reachablesG g1 roots ]
 
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index b1dd9c58ad..1807bdcea1 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -39,6 +39,7 @@ import GhcPrelude
 import X86.Instr
 import X86.Cond
 import X86.Regs
+import X86.Ppr (  )
 import X86.RegInfo
 
 import GHC.Platform.Regs
@@ -137,6 +138,56 @@ cmmTopCodeGen (CmmProc info lab live graph) = do
 cmmTopCodeGen (CmmData sec dat) = do
   return [CmmData sec (mkAlignment 1, dat)]  -- no translation, we just use CmmStatic
 
+{- Note [Verifying basic blocks]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   We want to guarantee a few things about the results
+   of instruction selection.
+
+   Namely that each basic blocks consists of:
+    * A (potentially empty) sequence of straight line instructions
+  followed by
+    * A (potentially empty) sequence of jump like instructions.
+
+    We can verify this by going through the instructions and
+    making sure that any non-jumpish instruction can't appear
+    after a jumpish instruction.
+
+    There are gotchas however:
+    * CALLs are strictly speaking control flow but here we care
+      not about them. Hence we treat them as regular instructions.
+
+      It's safe for them to appear inside a basic block
+      as (ignoring side effects inside the call) they will result in
+      straight line code.
+
+    * NEWBLOCK marks the start of a new basic block so can
+      be followed by any instructions.
+-}
+
+-- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally.
+verifyBasicBlock :: [Instr] -> ()
+verifyBasicBlock instrs
+  | debugIsOn     = go False instrs
+  | otherwise     = ()
+  where
+    go _     [] = ()
+    go atEnd (i:instr)
+        = case i of
+            -- Start a new basic block
+            NEWBLOCK {} -> go False instr
+            -- Calls are not viable block terminators
+            CALL {}     | atEnd -> faultyBlockWith i
+                        | not atEnd -> go atEnd instr
+            -- All instructions ok, check if we reached the end and continue.
+            _ | not atEnd -> go (isJumpishInstr i) instr
+              -- Only jumps allowed at the end of basic blocks.
+              | otherwise -> if isJumpishInstr i
+                                then go True instr
+                                else faultyBlockWith i
+    faultyBlockWith i
+        = pprPanic "Non control flow instructions after end of basic block."
+                   (ppr i <+> text "in:" $$ vcat (map ppr instrs))
 
 basicBlockCodeGen
         :: CmmBlock
@@ -155,9 +206,10 @@ basicBlockCodeGen block = do
             let line = srcSpanStartLine span; col = srcSpanStartCol span
             return $ unitOL $ LOCATION fileId line col name
     _ -> return nilOL
-  mid_instrs <- stmtsToInstrs id stmts
-  (!tail_instrs,_) <- stmtToInstrs id tail
+  (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
+  (!tail_instrs,_) <- stmtToInstrs mid_bid tail
   let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
+  return $! verifyBasicBlock (fromOL instrs)
   instrs' <- fold <$> traverse addSpUnwindings instrs
   -- code generation may introduce new basic block boundaries, which
   -- are indicated by the NEWBLOCK instruction.  We must split up the
@@ -251,12 +303,12 @@ basic block.
 -- See Note [Keeping track of the current block] for why
 -- we pass the BlockId.
 stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in.
-              -> [CmmNode e x] -- ^ Cmm Statement
-              -> NatM InstrBlock -- ^ Resulting instruction
+              -> [CmmNode O O] -- ^ Cmm Statement
+              -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction
 stmtsToInstrs bid stmts =
     go bid stmts nilOL
   where
-    go _   []         instr = return instr
+    go bid  []        instrs = return (instrs,bid)
     go bid (s:stmts)  instrs = do
       (instrs',bid') <- stmtToInstrs bid s
       -- If the statement introduced a new block, we use that one
@@ -1822,6 +1874,109 @@ genCondBranch' _ bid id false bool = do
         updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
         return (cond_code `appOL` code)
 
+{-  Note [Introducing cfg edges inside basic blocks]
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    During instruction selection a statement `s`
+    in a block B with control of the sort: B -> C
+    will sometimes result in control
+    flow of the sort:
+
+            ┌ < ┐
+            v   ^
+      B ->  B1  ┴ -> C
+
+    as is the case for some atomic operations.
+
+    Now to keep the CFG in sync when introducing B1 we clearly
+    want to insert it between B and C. However there is
+    a catch when we have to deal with self loops.
+
+    We might start with code and a CFG of these forms:
+
+    loop:
+        stmt1               ┌ < ┐
+        ....                v   ^
+        stmtX              loop ┘
+        stmtY
+        ....
+        goto loop:
+
+    Now we introduce B1:
+                            ┌ ─ ─ ─ ─ ─┐
+        loop:               │   ┌ <  ┐ │
+        instrs              v   │    │ ^
+        ....               loop ┴ B1 ┴ ┘
+        instrsFromX
+        stmtY
+        goto loop:
+
+    This is simple, all outgoing edges from loop now simply
+    start from B1 instead and the code generator knows which
+    new edges it introduced for the self loop of B1.
+
+    Disaster strikes if the statement Y follows the same pattern.
+    If we apply the same rule that all outgoing edges change then
+    we end up with:
+
+        loop ─> B1 ─> B2 ┬─┐
+          │      │    └─<┤ │
+          │      └───<───┘ │
+          └───────<────────┘
+
+    This is problematic. The edge B1->B1 is modified as expected.
+    However the modification is wrong!
+
+    The assembly in this case looked like this:
+
+    _loop:
+        <instrs>
+    _B1:
+        ...
+        cmpxchgq ...
+        jne _B1
+        <instrs>
+        <end _B1>
+    _B2:
+        ...
+        cmpxchgq ...
+        jne _B2
+        <instrs>
+        jmp loop
+
+    There is no edge _B2 -> _B1 here. It's still a self loop onto _B1.
+
+    The problem here is that really B1 should be two basic blocks.
+    Otherwise we have control flow in the *middle* of a basic block.
+    A contradiction!
+
+    So to account for this we add yet another basic block marker:
+
+    _B:
+        <instrs>
+    _B1:
+        ...
+        cmpxchgq ...
+        jne _B1
+        jmp _B1'
+    _B1':
+        <instrs>
+        <end _B1>
+    _B2:
+        ...
+
+    Now when inserting B2 we will only look at the outgoing edges of B1' and
+    everything will work out nicely.
+
+    You might also wonder why we don't insert jumps at the end of _B1'. There is
+    no way another block ends up jumping to the labels _B1 or _B2 since they are
+    essentially invisible to other blocks. View them as control flow labels local
+    to the basic block if you'd like.
+
+    Not doing this ultimately caused (part 2 of) #17334.
+-}
+
+
 -- -----------------------------------------------------------------------------
 --  Generating C calls
 
@@ -1889,26 +2044,34 @@ genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
         cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
                      -> NatM (OrdList Instr, BlockId)
         cmpxchg_code instrs = do
-            lbl <- getBlockIdNat
+            lbl1 <- getBlockIdNat
+            lbl2 <- getBlockIdNat
             tmp <- getNewRegNat format
 
             --Record inserted blocks
-            addImmediateSuccessorNat bid lbl
-            updateCfgNat (addWeightEdge lbl lbl 0)
+            --  We turn A -> B into A -> A' -> A'' -> B
+            --  with a self loop on A'.
+            addImmediateSuccessorNat bid lbl1
+            addImmediateSuccessorNat lbl1 lbl2
+            updateCfgNat (addWeightEdge lbl1 lbl1 0)
 
             return $ (toOL
                 [ MOV format (OpAddr amode) (OpReg eax)
-                , JXX ALWAYS lbl
-                , NEWBLOCK lbl
+                , JXX ALWAYS lbl1
+                , NEWBLOCK lbl1
                   -- Keep old value so we can return it:
                 , MOV format (OpReg eax) (OpReg dst_r)
                 , MOV format (OpReg eax) (OpReg tmp)
                 ]
                 `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
                 [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
-                , JXX NE lbl
+                , JXX NE lbl1
+                -- See Note [Introducing cfg edges inside basic blocks]
+                -- why this basic block is required.
+                , JXX ALWAYS lbl2
+                , NEWBLOCK lbl2
                 ],
-                lbl)
+                lbl2)
     format = intFormat width
 
 genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index d4502a0088..7e47860143 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -292,7 +292,9 @@ data Instr
                       [Maybe JumpDest] -- Targets of the jump table
                       Section   -- Data section jump table should be put in
                       CLabel    -- Label of jump table
-        | CALL        (Either Imm Reg) [Reg]
+        -- | X86 call instruction
+        | CALL        (Either Imm Reg) -- ^ Jump target
+                      [Reg]            -- ^ Arguments (required for register allocation)
 
         -- Other things.
         | CLTD Format            -- sign extend %eax into %edx:%eax
diff --git a/compiler/utils/Dominators.hs b/compiler/utils/Dominators.hs
index 9877c2c1f0..d6d8404564 100644
--- a/compiler/utils/Dominators.hs
+++ b/compiler/utils/Dominators.hs
@@ -53,9 +53,12 @@ import Control.Monad
 import Control.Monad.ST.Strict
 
 import Data.Array.ST
-import Data.Array.Base
-  (unsafeNewArray_
-  ,unsafeWrite,unsafeRead)
+import Data.Array.Base hiding ((!))
+  -- (unsafeNewArray_
+  -- ,unsafeWrite,unsafeRead
+  -- ,readArray,writeArray)
+
+import Util (debugIsOn)
 
 -----------------------------------------------------------------------------
 
@@ -399,13 +402,19 @@ infixr 2 .=
 
 (.=) :: (MArray (A s) a (ST s))
      => Arr s a -> a -> Int -> ST s ()
-(v .= x) i = unsafeWrite v i x
+(v .= x) i
+  | debugIsOn = writeArray v i x
+  | otherwise = unsafeWrite v i x
 
 (!:) :: (MArray (A s) a (ST s))
      => A s Int a -> Int -> ST s a
-a !: i = do
-  o <- unsafeRead a i
-  return $! o
+a !: i
+  | debugIsOn = do
+      o <- readArray a i
+      return $! o
+  | otherwise = do
+      o <- unsafeRead a i
+      return $! o
 
 new :: (MArray (A s) a (ST s))
     => Int -> ST s (Arr s a)
diff --git a/testsuite/tests/codeGen/should_compile/T17334.hs b/testsuite/tests/codeGen/should_compile/T17334.hs
index 27c0742aa7..6ad6d347e8 100644
--- a/testsuite/tests/codeGen/should_compile/T17334.hs
+++ b/testsuite/tests/codeGen/should_compile/T17334.hs
@@ -1,144 +1,240 @@
--- Reproducer for T17334
-{-# LANGUAGE BangPatterns          #-}
-{-# LANGUAGE FlexibleContexts      #-}
-{-# LANGUAGE MagicHash             #-}
-{-# LANGUAGE MultiParamTypeClasses #-}
-{-# LANGUAGE TypeFamilies          #-}
-{-# LANGUAGE UnboxedTuples         #-}
-
-module T17334 where
-
-import Control.Monad.ST
-import Data.Bits
-import Data.Kind
-import GHC.Exts
-import GHC.ST (ST(..))
-
-reverseInPlace :: UMVector s Bit -> ST s ()
-reverseInPlace xs = loop 0
- where
-  len = 4
-
-  loop !i
-    | i' < j = do
-      let w = 1
-          k = 2
-      x <- return 1
-      y <- return 2
-
-      writeWord xs i (meld w (reversePartialWord w y) x)
-
-      loop i'
-
-   where
-    !j  = 5
-    !i' = i + wordSize
-
-newtype Bit = Bit { unBit :: Bool }
-
-instance Unbox Bit
-
-data instance UMVector s Bit = BitMVec !Int !Int !(MutableByteArray s)
-data instance UVector    Bit = BitVec  !Int !Int !ByteArray
-
--- {-# NOINLINE writeWord #-}
-writeWord :: UMVector s Bit -> Int -> Word -> ST s ()
-writeWord !(BitMVec _ 0 _) _ _ = pure ()
-writeWord !(BitMVec off len' arr@(MutableByteArray mba)) !i' !x@(W# x#) = do
-  let len    = 5
-      lenMod = 6
-      i      = 7
-      nMod   = 8
-      loIx@(I# loIx#)   = 9
-
-  do
-        let W# andMask# = hiMask lenMod
-            W# orMask#  = x .&. loMask lenMod
-        primitive $ \state ->
-          let !(# state',  _ #) = fetchAndIntArray# mba loIx# (word2Int# andMask#) state  in
-            let !(# state'', _ #) = fetchOrIntArray#  mba loIx# (word2Int# orMask#)  state' in
-              (# state'', () #)
-
-instance GMVector UMVector Bit where
-  {-# INLINE basicLength #-}
-  basicLength (BitMVec _ n _) = n
-
-instance GVector UVector Bit where
-
-wordSize :: Int
-wordSize = 10
-
-lgWordSize :: Int
-lgWordSize = 11
-
-modWordSize :: Int -> Int
-modWordSize x = 12
-
-mask :: Int -> Word
-mask b = 13
-
-meld :: Int -> Word -> Word -> Word
-meld b lo hi = 14
-{-# INLINE meld #-}
-
-reverseWord :: Word -> Word
-reverseWord x0 = 15
-
-reversePartialWord :: Int -> Word -> Word
-reversePartialWord n w = 16
-
-loMask :: Int -> Word
-loMask n = 17
-
-hiMask :: Int -> Word
-hiMask n = 18
-
-class GMVector v a where
-  basicLength :: v s a -> Int
-
-type family GMutable (v :: Type -> Type) :: Type -> Type -> Type
-class GMVector (GMutable v) a => GVector v a
-data family UMVector s a
-data family UVector    a
-class (GVector UVector a, GMVector UMVector a) => Unbox a
-type instance GMutable UVector = UMVector
-
-data ByteArray = ByteArray ByteArray#
-data MutableByteArray s = MutableByteArray (MutableByteArray# s)
-
-readByteArray
-  :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> m a
-{-# INLINE readByteArray #-}
-readByteArray (MutableByteArray arr#) (I# i#)
-  = primitive (readByteArray# arr# i#)
-
-writeByteArray
-  :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> a -> m ()
-{-# INLINE writeByteArray #-}
-writeByteArray (MutableByteArray arr#) (I# i#) x
-  = primitive_ (writeByteArray# arr# i# x)
-
-class Prim a where
-  readByteArray# :: MutableByteArray# s -> Int# -> State# s -> (# State# s, a #)
-  writeByteArray# :: MutableByteArray# s -> Int# -> a -> State# s -> State# s
-
-instance Prim Word where
-  readByteArray#  arr# i# s# = case readWordArray# arr# i# s# of
-                                 (# s1#, x# #) -> (# s1#, W# x# #)
-  writeByteArray# arr# i# (W# x#) s# = writeWordArray# arr# i# x# s#
-
-class Monad m => PrimMonad m where
-  type PrimState m
-  primitive :: (State# (PrimState m) -> (# State# (PrimState m), a #)) -> m a
-
-instance PrimMonad (ST s) where
-  type PrimState (ST s) = s
-  primitive = ST
-  {-# INLINE primitive #-}
-
-primitive_ :: PrimMonad m
-              => (State# (PrimState m) -> State# (PrimState m)) -> m ()
-{-# INLINE primitive_ #-}
-primitive_ f = primitive (\s# ->
-    case f s# of
-        s'# -> (# s'#, () #))
+-- Reproducer for T17334
+{-# LANGUAGE BangPatterns          #-}
+{-# LANGUAGE FlexibleContexts      #-}
+{-# LANGUAGE MagicHash             #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE TypeFamilies          #-}
+{-# LANGUAGE UnboxedTuples         #-}
+
+--Reproducer uses 64bit literals in reverseWord.
+--It's ok to truncate those in x86
+{-# OPTIONS_GHC -Wno-overflowed-literals #-}
+
+module Bug (reverseInPlace) where
+
+import Control.Monad.ST
+import Data.Bits
+import GHC.Exts
+import GHC.ST (ST(..))
+import Data.Kind
+
+reverseInPlace :: PrimMonad m => UMVector (PrimState m) Bit -> m ()
+reverseInPlace xs | len == 0  = pure ()
+                  | otherwise = loop 0
+ where
+  len = ulength xs
+
+  loop !i
+    | i' <= j' = do
+      x <- readWord xs i
+      y <- readWord xs j'
+
+      writeWord xs i  (reverseWord y)
+      writeWord xs j' (reverseWord x)
+
+      loop i'
+    | i' < j = do
+      let w = (j - i) `shiftR` 1
+          k = j - w
+      x <- readWord xs i
+      y <- readWord xs k
+
+      writeWord xs i (meld w (reversePartialWord w y) x)
+      writeWord xs k (meld w (reversePartialWord w x) y)
+
+      loop i'
+    | otherwise = do
+      let w = j - i
+      x <- readWord xs i
+      writeWord xs i (meld w (reversePartialWord w x) x)
+   where
+    !j  = len - i
+    !i' = i + wordSize
+    !j' = j - wordSize
+{-# SPECIALIZE reverseInPlace :: UMVector s Bit -> ST s () #-}
+
+newtype Bit = Bit { unBit :: Bool }
+
+instance Unbox Bit
+
+data instance UMVector s Bit = BitMVec !Int !Int !(MutableByteArray s)
+data instance UVector    Bit = BitVec  !Int !Int !ByteArray
+
+readWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> m Word
+readWord !(BitMVec _ 0 _) _ = pure 0
+readWord !(BitMVec off len' arr) !i' = do
+  let len  = off + len'
+      i    = off + i'
+      nMod = modWordSize i
+      loIx = divWordSize i
+  loWord <- readByteArray arr loIx
+
+  if nMod == 0
+    then pure loWord
+    else if loIx == divWordSize (len - 1)
+      then pure (loWord `unsafeShiftR` nMod)
+      else do
+        hiWord <- readByteArray arr (loIx + 1)
+        pure
+          $   (loWord `unsafeShiftR` nMod)
+          .|. (hiWord `unsafeShiftL` (wordSize - nMod))
+{-# SPECIALIZE readWord :: UMVector s Bit -> Int -> ST s Word #-}
+{-# INLINE readWord #-}
+
+writeWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> Word -> m ()
+writeWord !(BitMVec _ 0 _) _ _ = pure ()
+writeWord !(BitMVec off len' arr@(MutableByteArray mba)) !i' !x@(W# x#) = do
+  let len    = off + len'
+      lenMod = modWordSize len
+      i      = off + i'
+      nMod   = modWordSize i
+      loIx@(I# loIx#)   = divWordSize i
+
+  if nMod == 0
+    then if len >= i + wordSize
+      then primitive $ \state ->
+        (# atomicWriteIntArray# mba loIx# (word2Int# x#) state, () #)
+      else do
+        let W# andMask# = hiMask lenMod
+            W# orMask#  = x .&. loMask lenMod
+        primitive $ \state ->
+          let !(# state',  _ #) = fetchAndIntArray# mba loIx# (word2Int# andMask#) state  in
+            let !(# state'', _ #) = fetchOrIntArray#  mba loIx# (word2Int# orMask#)  state' in
+              (# state'', () #)
+    else if loIx == divWordSize (len - 1)
+      then do
+        loWord <- readByteArray arr loIx
+        if lenMod == 0
+          then
+            writeByteArray arr loIx
+            $   (loWord .&. loMask nMod)
+            .|. (x `unsafeShiftL` nMod)
+          else
+            writeByteArray arr loIx
+            $   (loWord .&. (loMask nMod .|. hiMask lenMod))
+            .|. ((x `unsafeShiftL` nMod) .&. loMask lenMod)
+      else do
+        loWord <- readByteArray arr loIx
+        writeByteArray arr loIx
+          $   (loWord .&. loMask nMod)
+          .|. (x `unsafeShiftL` nMod)
+        hiWord <- readByteArray arr (loIx + 1)
+        writeByteArray arr (loIx + 1)
+          $   (hiWord .&. hiMask nMod)
+          .|. (x `unsafeShiftR` (wordSize - nMod))
+{-# SPECIALIZE writeWord :: UMVector s Bit -> Int -> Word -> ST s () #-}
+{-# INLINE writeWord #-}
+
+instance GMVector UMVector Bit where
+  {-# INLINE basicLength #-}
+  basicLength (BitMVec _ n _) = n
+
+instance GVector UVector Bit where
+
+wordSize :: Int
+wordSize = finiteBitSize (0 :: Word)
+
+lgWordSize :: Int
+lgWordSize = case wordSize of
+  32 -> 5
+  64 -> 6
+  _  -> error "wordsToBytes: unknown architecture"
+
+divWordSize :: Bits a => a -> a
+divWordSize x = unsafeShiftR x lgWordSize
+{-# INLINE divWordSize #-}
+
+modWordSize :: Int -> Int
+modWordSize x = x .&. (wordSize - 1)
+{-# INLINE modWordSize #-}
+
+mask :: Int -> Word
+mask b = m
+ where
+  m | b >= finiteBitSize m = complement 0
+    | b < 0                = 0
+    | otherwise            = bit b - 1
+
+meld :: Int -> Word -> Word -> Word
+meld b lo hi = (lo .&. m) .|. (hi .&. complement m) where m = mask b
+{-# INLINE meld #-}
+
+reverseWord :: Word -> Word
+reverseWord x0 = x6
+ where
+  x1 = ((x0 .&. 0x5555555555555555) `shiftL`  1) .|. ((x0 .&. 0xAAAAAAAAAAAAAAAA) `shiftR`  1)
+  x2 = ((x1 .&. 0x3333333333333333) `shiftL`  2) .|. ((x1 .&. 0xCCCCCCCCCCCCCCCC) `shiftR`  2)
+  x3 = ((x2 .&. 0x0F0F0F0F0F0F0F0F) `shiftL`  4) .|. ((x2 .&. 0xF0F0F0F0F0F0F0F0) `shiftR`  4)
+  x4 = ((x3 .&. 0x00FF00FF00FF00FF) `shiftL`  8) .|. ((x3 .&. 0xFF00FF00FF00FF00) `shiftR`  8)
+  x5 = ((x4 .&. 0x0000FFFF0000FFFF) `shiftL` 16) .|. ((x4 .&. 0xFFFF0000FFFF0000) `shiftR` 16)
+  x6 = ((x5 .&. 0x00000000FFFFFFFF) `shiftL` 32) .|. ((x5 .&. 0xFFFFFFFF00000000) `shiftR` 32)
+
+reversePartialWord :: Int -> Word -> Word
+reversePartialWord n w | n >= wordSize = reverseWord w
+                       | otherwise     = reverseWord w `shiftR` (wordSize - n)
+
+loMask :: Int -> Word
+loMask n = 1 `unsafeShiftL` n - 1
+{-# INLINE loMask #-}
+
+hiMask :: Int -> Word
+hiMask n = complement (1 `unsafeShiftL` n - 1)
+{-# INLINE hiMask #-}
+
+class GMVector v a where
+  basicLength :: v s a -> Int
+
+glength :: GMVector v a => v s a -> Int
+{-# INLINE glength #-}
+glength = basicLength
+
+type family GMutable (v :: Type -> Type) :: Type -> Type -> Type
+class GMVector (GMutable v) a => GVector v a
+data family UMVector s a
+data family UVector    a
+class (GVector UVector a, GMVector UMVector a) => Unbox a
+type instance GMutable UVector = UMVector
+
+ulength :: Unbox a => UMVector s a -> Int
+{-# INLINE ulength #-}
+ulength = glength
+
+data ByteArray = ByteArray ByteArray#
+data MutableByteArray s = MutableByteArray (MutableByteArray# s)
+
+readByteArray
+  :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> m a
+{-# INLINE readByteArray #-}
+readByteArray (MutableByteArray arr#) (I# i#)
+  = primitive (readByteArray# arr# i#)
+
+writeByteArray
+  :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> a -> m ()
+{-# INLINE writeByteArray #-}
+writeByteArray (MutableByteArray arr#) (I# i#) x
+  = primitive_ (writeByteArray# arr# i# x)
+
+class Prim a where
+  readByteArray# :: MutableByteArray# s -> Int# -> State# s -> (# State# s, a #)
+  writeByteArray# :: MutableByteArray# s -> Int# -> a -> State# s -> State# s
+
+instance Prim Word where
+  readByteArray#  arr# i# s# = case readWordArray# arr# i# s# of
+                                 (# s1#, x# #) -> (# s1#, W# x# #)
+  writeByteArray# arr# i# (W# x#) s# = writeWordArray# arr# i# x# s#
+
+class Monad m => PrimMonad m where
+  type PrimState m
+  primitive :: (State# (PrimState m) -> (# State# (PrimState m), a #)) -> m a
+
+instance PrimMonad (ST s) where
+  type PrimState (ST s) = s
+  primitive = ST
+  {-# INLINE primitive #-}
+
+primitive_ :: PrimMonad m
+              => (State# (PrimState m) -> State# (PrimState m)) -> m ()
+{-# INLINE primitive_ #-}
+primitive_ f = primitive (\s# ->
+    case f s# of
+        s'# -> (# s'#, () #))