48 files changed, 3594 insertions, 132 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3e0db17aa4..2d33eaef13 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -740,14 +740,6 @@ validate-aarch64-darwin:
   tags:
     - aarch64-linux
 
-.build-aarch64-linux-deb10-llvm:
-  extends: .build-aarch64-linux-deb10
-  stage: full-build
-  variables:
-    BUILD_FLAVOUR: perf-llvm
-  tags:
-    - aarch64-linux
-
 validate-aarch64-linux-deb10:
   extends: .build-aarch64-linux-deb10
   artifacts:
@@ -760,6 +752,20 @@ nightly-aarch64-linux-deb10:
   variables:
     TEST_TYPE: slowtest
 
+.build-aarch64-linux-deb10-llvm:
+  extends: .build-aarch64-linux-deb10
+  stage: full-build
+  variables:
+    BUILD_FLAVOUR: perf-llvm
+  tags:
+    - aarch64-linux
+
+validate-aarch64-linux-deb10-llvm:
+  extends: .build-aarch64-linux-deb10-llvm
+  artifacts:
+    when: always
+    expire_in: 2 week
+
 #################################
 # armv7-linux-deb10
 #################################
diff --git a/compiler/GHC/Cmm/CLabel.hs b/compiler/GHC/Cmm/CLabel.hs
index cda35a4943..b6ad2b3431 100644
--- a/compiler/GHC/Cmm/CLabel.hs
+++ b/compiler/GHC/Cmm/CLabel.hs
@@ -287,6 +287,12 @@ data CLabel
 
   deriving Eq
 
+instance Show CLabel where
+  show = showPprUnsafe . pprDebugCLabel genericPlatform
+
+instance Outputable CLabel where
+  ppr = text . show
+
 isIdLabel :: CLabel -> Bool
 isIdLabel IdLabel{} = True
 isIdLabel _ = False
@@ -1544,6 +1550,7 @@ pprDynamicLinkerAsmLabel !platform dllInfo ppLbl =
             SymbolPtr       -> char 'L' <> ppLbl <> text "$non_lazy_ptr"
             GotSymbolPtr    -> ppLbl <> text "@GOTPCREL"
             GotSymbolOffset -> ppLbl
+        | platformArch platform == ArchAArch64 -> ppLbl
         | otherwise ->
           case dllInfo of
             CodeStub  -> char 'L' <> ppLbl <> text "$stub"
@@ -1572,6 +1579,10 @@ pprDynamicLinkerAsmLabel !platform dllInfo ppLbl =
           SymbolPtr -> text ".LC_" <> ppLbl
           _         -> panic "pprDynamicLinkerAsmLabel"
 
+      | platformArch platform == ArchAArch64
+      = ppLbl
+
+
       | platformArch platform == ArchX86_64
       = case dllInfo of
           CodeStub        -> ppLbl <> text "@plt"
diff --git a/compiler/GHC/Cmm/Expr.hs b/compiler/GHC/Cmm/Expr.hs
index 86b06271d1..52cb63c901 100644
--- a/compiler/GHC/Cmm/Expr.hs
+++ b/compiler/GHC/Cmm/Expr.hs
@@ -65,6 +65,7 @@ data CmmExpr
         --        ** is shorthand only, meaning **
         -- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
         --      where rep = typeWidth (cmmRegType reg)
+  deriving Show
 
 instance Eq CmmExpr where       -- Equality ignores the types
   CmmLit l1          == CmmLit l2          = l1==l2
@@ -78,7 +79,7 @@ instance Eq CmmExpr where       -- Equality ignores the types
 data CmmReg
   = CmmLocal  {-# UNPACK #-} !LocalReg
   | CmmGlobal GlobalReg
-  deriving( Eq, Ord )
+  deriving( Eq, Ord, Show )
 
 -- | A stack area is either the stack slot where a variable is spilled
 -- or the stack space where function arguments and results are passed.
@@ -86,7 +87,7 @@ data Area
   = Old            -- See Note [Old Area]
   | Young {-# UNPACK #-} !BlockId  -- Invariant: must be a continuation BlockId
                    -- See Note [Continuation BlockId] in GHC.Cmm.Node.
-  deriving (Eq, Ord)
+  deriving (Eq, Ord, Show)
 
 {- Note [Old Area]
 ~~~~~~~~~~~~~~~~~~
@@ -209,7 +210,7 @@ data CmmLit
                      -- During the stack-layout pass, CmmHighStackMark
                      -- is replaced by a CmmInt for the actual number
                      -- of bytes used
-  deriving Eq
+  deriving (Eq, Show)
 
 instance Outputable CmmLit where
   ppr (CmmInt n w) = text "CmmInt" <+> ppr n <+> ppr w
@@ -279,6 +280,7 @@ data LocalReg
     -- ^ Parameters:
     --   1. Identifier
     --   2. Type
+  deriving Show
 
 instance Eq LocalReg where
   (LocalReg u1 _) == (LocalReg u2 _) = u1 == u2
diff --git a/compiler/GHC/Cmm/Parser.y b/compiler/GHC/Cmm/Parser.y
index 490a3c4976..b8a6f7de7c 100644
--- a/compiler/GHC/Cmm/Parser.y
+++ b/compiler/GHC/Cmm/Parser.y
@@ -1021,6 +1021,45 @@ machOps = listToUFM $
 callishMachOps :: Platform -> UniqFM FastString ([CmmExpr] -> (CallishMachOp, [CmmExpr]))
 callishMachOps platform = listToUFM $
         map (\(x, y) -> (mkFastString x, y)) [
+
+        ( "pow64f", (MO_F64_Pwr,) ),
+        ( "sin64f", (MO_F64_Sin,) ),
+        ( "cos64f", (MO_F64_Cos,) ),
+        ( "tan64f", (MO_F64_Tan,) ),
+        ( "sinh64f", (MO_F64_Sinh,) ),
+        ( "cosh64f", (MO_F64_Cosh,) ),
+        ( "tanh64f", (MO_F64_Tanh,) ),
+        ( "asin64f", (MO_F64_Asin,) ),
+        ( "acos64f", (MO_F64_Acos,) ),
+        ( "atan64f", (MO_F64_Atan,) ),
+        ( "asinh64f", (MO_F64_Asinh,) ),
+        ( "acosh64f", (MO_F64_Acosh,) ),
+        ( "log64f", (MO_F64_Log,) ),
+        ( "log1p64f", (MO_F64_Log1P,) ),
+        ( "exp64f", (MO_F64_Exp,) ),
+        ( "expM164f", (MO_F64_ExpM1,) ),
+        ( "fabs64f", (MO_F64_Fabs,) ),
+        ( "sqrt64f", (MO_F64_Sqrt,) ),
+
+        ( "pow32f", (MO_F32_Pwr,) ),
+        ( "sin32f", (MO_F32_Sin,) ),
+        ( "cos32f", (MO_F32_Cos,) ),
+        ( "tan32f", (MO_F32_Tan,) ),
+        ( "sinh32f", (MO_F32_Sinh,) ),
+        ( "cosh32f", (MO_F32_Cosh,) ),
+        ( "tanh32f", (MO_F32_Tanh,) ),
+        ( "asin32f", (MO_F32_Asin,) ),
+        ( "acos32f", (MO_F32_Acos,) ),
+        ( "atan32f", (MO_F32_Atan,) ),
+        ( "asinh32f", (MO_F32_Asinh,) ),
+        ( "acosh32f", (MO_F32_Acosh,) ),
+        ( "log32f", (MO_F32_Log,) ),
+        ( "log1p32f", (MO_F32_Log1P,) ),
+        ( "exp32f", (MO_F32_Exp,) ),
+        ( "expM132f", (MO_F32_ExpM1,) ),
+        ( "fabs32f", (MO_F32_Fabs,) ),
+        ( "sqrt32f", (MO_F32_Sqrt,) ),
+
         ( "read_barrier", (MO_ReadBarrier,)),
         ( "write_barrier", (MO_WriteBarrier,)),
         ( "memcpy", memcpyLikeTweakArgs MO_Memcpy ),
@@ -1060,10 +1099,6 @@ callishMachOps platform = listToUFM $
         ( "xchg16", (MO_Xchg W16,)),
         ( "xchg32", (MO_Xchg W32,)),
         ( "xchg64", (MO_Xchg W64,))
-
-        -- ToDo: the rest, maybe
-        -- edit: which rest?
-        -- also: how do we tell CMM Lint how to type check callish macops?
     ]
   where
     memcpyLikeTweakArgs :: (Int -> CallishMachOp) -> [CmmExpr] -> (CallishMachOp, [CmmExpr])
diff --git a/compiler/GHC/Cmm/Type.hs b/compiler/GHC/Cmm/Type.hs
index c7e2a4069b..1227b37ced 100644
--- a/compiler/GHC/Cmm/Type.hs
+++ b/compiler/GHC/Cmm/Type.hs
@@ -51,13 +51,14 @@ import Data.Int
 
 data CmmType    -- The important one!
   = CmmType CmmCat !Width
+  deriving Show
 
 data CmmCat                -- "Category" (not exported)
    = GcPtrCat              -- GC pointer
    | BitsCat               -- Non-pointer
    | FloatCat              -- Float
    | VecCat Length CmmCat  -- Vector
-   deriving( Eq )
+   deriving( Eq, Show )
         -- See Note [Signed vs unsigned] at the end
 
 instance Outputable CmmType where
@@ -434,4 +435,3 @@ C calling convention rather early on in the compiler).  However, given
 this, the cons outweigh the pros.
 
 -}
-
diff --git a/compiler/GHC/Cmm/Utils.hs b/compiler/GHC/Cmm/Utils.hs
index c1419cdd12..596b8d050f 100644
--- a/compiler/GHC/Cmm/Utils.hs
+++ b/compiler/GHC/Cmm/Utils.hs
@@ -14,7 +14,7 @@
 
 module GHC.Cmm.Utils(
         -- CmmType
-        primRepCmmType, slotCmmType, slotForeignHint,
+        primRepCmmType, slotCmmType,
         typeCmmType, typeForeignHint, primRepForeignHint,
 
         -- CmmLit
@@ -159,14 +159,6 @@ primRepForeignHint FloatRep     = NoHint
 primRepForeignHint DoubleRep    = NoHint
 primRepForeignHint (VecRep {})  = NoHint
 
-slotForeignHint :: SlotTy -> ForeignHint
-slotForeignHint PtrLiftedSlot   = AddrHint
-slotForeignHint PtrUnliftedSlot = AddrHint
-slotForeignHint WordSlot        = NoHint
-slotForeignHint Word64Slot      = NoHint
-slotForeignHint FloatSlot       = NoHint
-slotForeignHint DoubleSlot      = NoHint
-
 typeForeignHint :: UnaryType -> ForeignHint
 typeForeignHint = primRepForeignHint . typePrimRep1
 
diff --git a/compiler/GHC/CmmToAsm.hs b/compiler/GHC/CmmToAsm.hs
index 00ef59660f..82122911b6 100644
--- a/compiler/GHC/CmmToAsm.hs
+++ b/compiler/GHC/CmmToAsm.hs
@@ -84,6 +84,7 @@ import GHC.Prelude
 import qualified GHC.CmmToAsm.X86   as X86
 import qualified GHC.CmmToAsm.PPC   as PPC
 import qualified GHC.CmmToAsm.SPARC as SPARC
+import qualified GHC.CmmToAsm.AArch64 as AArch64
 
 import GHC.CmmToAsm.Reg.Liveness
 import qualified GHC.CmmToAsm.Reg.Linear                as Linear
@@ -166,7 +167,7 @@ nativeCodeGen logger dflags this_mod modLoc h us cmms
       ArchSPARC64   -> panic "nativeCodeGen: No NCG for SPARC64"
       ArchS390X     -> panic "nativeCodeGen: No NCG for S390X"
       ArchARM {}    -> panic "nativeCodeGen: No NCG for ARM"
-      ArchAArch64   -> panic "nativeCodeGen: No NCG for AArch64"
+      ArchAArch64   -> nCG' (AArch64.ncgAArch64 config)
       ArchAlpha     -> panic "nativeCodeGen: No NCG for Alpha"
       ArchMipseb    -> panic "nativeCodeGen: No NCG for mipseb"
       ArchMipsel    -> panic "nativeCodeGen: No NCG for mipsel"
@@ -174,7 +175,6 @@ nativeCodeGen logger dflags this_mod modLoc h us cmms
       ArchUnknown   -> panic "nativeCodeGen: No NCG for unknown arch"
       ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript"
 
-
 -- | Data accumulated during code generation. Mostly about statistics,
 -- but also collects debug data for DWARF generation.
 data NativeGenAcc statics instr
@@ -1191,9 +1191,9 @@ initNCGConfig dflags this_mod = NCGConfig
             ArchX86    -> v
             _          -> Nothing
 
-   , ncgDwarfEnabled        = debugLevel dflags >  0
-   , ncgDwarfUnwindings     = debugLevel dflags >= 1
+   , ncgDwarfEnabled        = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 0 && platformArch (targetPlatform dflags) /= ArchAArch64
+   , ncgDwarfUnwindings     = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 0
+   , ncgDwarfStripBlockInfo = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags < 2 -- We strip out block information when running with -g0 or -g1.
+   , ncgDwarfSourceNotes    = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 2 -- We produce GHC-specific source-note DIEs only with -g3
    , ncgExposeInternalSymbols = gopt Opt_ExposeInternalSymbols dflags
-   , ncgDwarfStripBlockInfo = debugLevel dflags <  2 -- We strip out block information when running with -g0 or -g1.
-   , ncgDwarfSourceNotes    = debugLevel dflags >= 3 -- We produce GHC-specific source-note DIEs only with -g3
    }
diff --git a/compiler/GHC/CmmToAsm/AArch64.hs b/compiler/GHC/CmmToAsm/AArch64.hs
new file mode 100644
index 0000000000..14f4b5d0bd
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64.hs
@@ -0,0 +1,60 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+-- | Native code generator for x86 and x86-64 architectures
+module GHC.CmmToAsm.AArch64
+   ( ncgAArch64 )
+where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+
+import qualified GHC.CmmToAsm.AArch64.Instr   as AArch64
+import qualified GHC.CmmToAsm.AArch64.Ppr     as AArch64
+import qualified GHC.CmmToAsm.AArch64.CodeGen as AArch64
+import qualified GHC.CmmToAsm.AArch64.Regs    as AArch64
+import qualified GHC.CmmToAsm.AArch64.RegInfo as AArch64
+
+ncgAArch64 :: NCGConfig -> NcgImpl RawCmmStatics AArch64.Instr AArch64.JumpDest
+ncgAArch64 config
+ = NcgImpl {
+        ncgConfig                 = config
+       ,cmmTopCodeGen             = AArch64.cmmTopCodeGen
+       ,generateJumpTableForInstr = AArch64.generateJumpTableForInstr config
+       ,getJumpDestBlockId        = AArch64.getJumpDestBlockId
+       ,canShortcut               = AArch64.canShortcut
+       ,shortcutStatics           = AArch64.shortcutStatics
+       ,shortcutJump              = AArch64.shortcutJump
+       ,pprNatCmmDecl             = AArch64.pprNatCmmDecl config
+       ,maxSpillSlots             = AArch64.maxSpillSlots config
+       ,allocatableRegs           = AArch64.allocatableRegs platform
+       ,ncgAllocMoreStack         = AArch64.allocMoreStack platform
+       ,ncgExpandTop              = id
+       ,ncgMakeFarBranches        = const id
+       ,extractUnwindPoints       = const []
+       ,invertCondBranches        = \_ _ -> id
+  }
+    where
+      platform = ncgPlatform config
+
+-- | Instruction instance for aarch64
+instance Instruction AArch64.Instr where
+        regUsageOfInstr         = AArch64.regUsageOfInstr
+        patchRegsOfInstr        = AArch64.patchRegsOfInstr
+        isJumpishInstr          = AArch64.isJumpishInstr
+        jumpDestsOfInstr        = AArch64.jumpDestsOfInstr
+        patchJumpInstr          = AArch64.patchJumpInstr
+        mkSpillInstr            = AArch64.mkSpillInstr
+        mkLoadInstr             = AArch64.mkLoadInstr
+        takeDeltaInstr          = AArch64.takeDeltaInstr
+        isMetaInstr             = AArch64.isMetaInstr
+        mkRegRegMoveInstr _     = AArch64.mkRegRegMoveInstr
+        takeRegRegMoveInstr     = AArch64.takeRegRegMoveInstr
+        mkJumpInstr             = AArch64.mkJumpInstr
+        mkStackAllocInstr       = AArch64.mkStackAllocInstr
+        mkStackDeallocInstr     = AArch64.mkStackDeallocInstr
+        mkComment               = pure . AArch64.COMMENT
+        pprInstr                = AArch64.pprInstr
diff --git a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
new file mode 100644
index 0000000000..b0984070fc
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
@@ -0,0 +1,1358 @@
+{-# language GADTs #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BinaryLiterals #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NumericUnderscores #-}
+module GHC.CmmToAsm.AArch64.CodeGen (
+      cmmTopCodeGen
+    , generateJumpTableForInstr
+)
+
+where
+
+-- NCG stuff:
+import GHC.Prelude hiding (EQ)
+
+import GHC.Platform.Regs
+import GHC.CmmToAsm.AArch64.Instr
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.CmmToAsm.AArch64.Cond
+
+import GHC.CmmToAsm.CPrim
+import GHC.Cmm.DebugBlock
+import GHC.CmmToAsm.Monad
+   ( NatM, getNewRegNat
+   , getPicBaseMaybeNat, getPlatform, getConfig
+   , getDebugBlock, getFileId
+   )
+-- import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+import GHC.Platform.Reg
+import GHC.Platform
+
+-- Our intermediate code:
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.CLabel
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+import GHC.Types.Tickish ( GenTickish(..) )
+import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
+
+-- The rest:
+import GHC.Data.OrdList
+import GHC.Utils.Outputable
+
+import Control.Monad    ( mapAndUnzipM, when, foldM )
+import Data.Word
+import Data.Maybe
+import GHC.Float
+
+import GHC.Types.Basic
+import GHC.Types.ForeignCall
+import GHC.Data.FastString
+import GHC.Utils.Misc
+import GHC.Utils.Panic
+
+-- Note [General layout of an NCG]
+-- @cmmTopCodeGen@ will be our main entry point to code gen.  Here we'll get
+-- @RawCmmDecl@; see GHC.Cmm
+--
+--   RawCmmDecl = GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) CmmGraph
+--
+--   GenCmmDecl d h g = CmmProc h CLabel [GlobalReg] g
+--                    | CmmData Section d
+--
+-- As a result we want to transform this to a list of @NatCmmDecl@, which is
+-- defined @GHC.CmmToAsm.Instr@ as
+--
+--   type NatCmmDecl statics instr
+--        = GenCmmDecl statics (LabelMap RawCmmStatics) (ListGraph instr)
+--
+-- Thus well' turn
+--   GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) CmmGraph
+-- into
+--   [GenCmmDecl RawCmmStatics (LabelMap RawCmmStatics) (ListGraph Instr)]
+--
+-- where @CmmGraph@ is
+--
+--   type CmmGraph = GenCmmGraph CmmNode
+--   data GenCmmGraph n = CmmGraph { g_entry :: BlockId, g_graph :: Graph n C C }
+--   type CmmBlock = Block CmmNode C C
+--
+-- and @ListGraph Instr@ is
+--
+--   newtype ListGraph i = ListGraph [GenBasicBlock i]
+--   data GenBasicBlock i = BasicBlock BlockId [i]
+
+cmmTopCodeGen
+    :: RawCmmDecl
+    -> NatM [NatCmmDecl RawCmmStatics Instr]
+
+-- Thus we'll have to deal with either CmmProc ...
+cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do
+  -- do
+  --   traceM $ "-- -------------------------- cmmTopGen (CmmProc) -------------------------- --\n"
+  --         ++ showSDocUnsafe (ppr cmm)
+
+  let blocks = toBlockListEntryFirst graph
+  (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+  picBaseMb <- getPicBaseMaybeNat
+
+  let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+      tops = proc : concat statics
+
+  case picBaseMb of
+      Just _picBase -> panic "AArch64.cmmTopCodeGen: picBase not implemented"
+      Nothing -> return tops
+
+-- ... or CmmData.
+cmmTopCodeGen _cmm@(CmmData sec dat) = do
+  -- do
+  --   traceM $ "-- -------------------------- cmmTopGen (CmmData) -------------------------- --\n"
+  --         ++ showSDocUnsafe (ppr cmm)
+  return [CmmData sec dat] -- no translation, we just use CmmStatic
+
+basicBlockCodeGen
+        :: Block CmmNode C C
+        -> NatM ( [NatBasicBlock Instr]
+                , [NatCmmDecl RawCmmStatics Instr])
+
+basicBlockCodeGen block = do
+  config <- getConfig
+  -- do
+  --   traceM $ "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
+  --         ++ showSDocUnsafe (ppr block)
+  let (_, nodes, tail)  = blockSplit block
+      id = entryLabel block
+      stmts = blockToList nodes
+
+      header_comment_instr = unitOL $ MULTILINE_COMMENT (
+          text "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
+          $+$ pdoc (ncgPlatform config) block
+          )
+  -- Generate location directive
+  dbg <- getDebugBlock (entryLabel block)
+  loc_instrs <- case dblSourceTick =<< dbg of
+    Just (SourceNote span name)
+      -> do fileId <- getFileId (srcSpanFile span)
+            let line = srcSpanStartLine span; col = srcSpanStartCol span
+            return $ unitOL $ LOCATION fileId line col name
+    _ -> return nilOL
+  (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
+  (!tail_instrs,_) <- stmtToInstrs mid_bid tail
+  let instrs = header_comment_instr `appOL` loc_instrs `appOL` mid_instrs `appOL` tail_instrs
+  -- TODO: Then x86 backend run @verifyBasicBlock@ here and inserts
+  --      unwinding info. See Ticket 19913
+  -- code generation may introduce new basic block boundaries, which
+  -- are indicated by the NEWBLOCK instruction.  We must split up the
+  -- instruction stream into basic blocks again.  Also, we extract
+  -- LDATAs here too.
+  let
+        (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs
+
+        mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+          = ([], BasicBlock id instrs : blocks, statics)
+        mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+          = (instrs, blocks, CmmData sec dat:statics)
+        mkBlocks instr (instrs,blocks,statics)
+          = (instr:instrs, blocks, statics)
+  return (BasicBlock id top : other_blocks, statics)
+
+
+-- -----------------------------------------------------------------------------
+-- | Utilities
+ann :: SDoc -> Instr -> Instr
+ann doc instr {- | debugIsOn -} = ANN doc instr
+-- ann _ instr = instr
+{-# INLINE ann #-}
+
+-- Using pprExpr will hide the AST, @ANN@ will end up in the assembly with
+-- -dppr-debug.  The idea is that we can trivially see how a cmm expression
+-- ended up producing the assmebly we see.  By having the verbatim AST printed
+-- we can simply check the patterns that were matched to arrive at the assmebly
+-- we generated.
+--
+-- pprExpr will hide a lot of noise of the underlying data structure and print
+-- the expression into something that can be easily read by a human. However
+-- going back to the exact CmmExpr representation can be labourous and adds
+-- indirections to find the matches that lead to the assembly.
+--
+-- An improvement oculd be to have
+--
+--    (pprExpr genericPlatform e) <> parens (text. show e)
+--
+-- to have the best of both worlds.
+--
+-- Note: debugIsOn is too restrictive, it only works for debug compilers.
+-- However, we do not only want to inspect this for debug compilers. Ideally
+-- we'd have a check for -dppr-debug here already, such that we don't even
+-- generate the ANN expressions. However, as they are lazy, they shouldn't be
+-- forced until we actually force them, and without -dppr-debug they should
+-- never end up being forced.
+annExpr :: CmmExpr -> Instr -> Instr
+annExpr e instr {- | debugIsOn -} = ANN (text . show $ e) instr
+-- annExpr e instr {- | debugIsOn -} = ANN (pprExpr genericPlatform e) instr
+-- annExpr _ instr = instr
+{-# INLINE annExpr #-}
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+-- TODO jump tables would be a lot faster, but we'll use bare bones for now.
+-- this is usually done by sticking the jump table ids into an instruction
+-- and then have the @generateJumpTableForInstr@ callback produce the jump
+-- table as a static.
+--
+-- See Ticket 19912
+--
+-- data SwitchTargets =
+--    SwitchTargets
+--        Bool                       -- Signed values
+--        (Integer, Integer)         -- Range
+--        (Maybe Label)              -- Default value
+--        (M.Map Integer Label)      -- The branches
+--
+-- Non Jumptable plan:
+-- xE <- expr
+--
+genSwitch :: CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch expr targets = do -- pprPanic "genSwitch" (ppr expr)
+  (reg, format, code) <- getSomeReg expr
+  let w = formatToWidth format
+  let mkbranch acc (key, bid) = do
+        (keyReg, _format, code) <- getSomeReg (CmmLit (CmmInt key w))
+        return $ code `appOL`
+                 toOL [ CMP (OpReg w reg) (OpReg w keyReg)
+                      , BCOND EQ (TBlock bid)
+                      ] `appOL` acc
+      def_code = case switchTargetsDefault targets of
+        Just bid -> unitOL (B (TBlock bid))
+        Nothing  -> nilOL
+
+  switch_code <- foldM mkbranch nilOL (switchTargetsCases targets)
+  return $ code `appOL` switch_code `appOL` def_code
+
+-- We don't do jump tables for now, see Ticket 19912
+generateJumpTableForInstr :: NCGConfig -> Instr
+  -> Maybe (NatCmmDecl RawCmmStatics Instr)
+generateJumpTableForInstr _ _ = Nothing
+
+-- -----------------------------------------------------------------------------
+-- Top-level of the instruction selector
+
+-- See Note [Keeping track of the current block] for why
+-- we pass the BlockId.
+stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in.
+              -> [CmmNode O O] -- ^ Cmm Statement
+              -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction
+stmtsToInstrs bid stmts =
+    go bid stmts nilOL
+  where
+    go bid  []        instrs = return (instrs,bid)
+    go bid (s:stmts)  instrs = do
+      (instrs',bid') <- stmtToInstrs bid s
+      -- If the statement introduced a new block, we use that one
+      let !newBid = fromMaybe bid bid'
+      go newBid stmts (instrs `appOL` instrs')
+
+-- | `bid` refers to the current block and is used to update the CFG
+--   if new blocks are inserted in the control flow.
+-- See Note [Keeping track of the current block] for more details.
+stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
+             -> CmmNode e x
+             -> NatM (InstrBlock, Maybe BlockId)
+             -- ^ Instructions, and bid of new block if successive
+             -- statements are placed in a different basic block.
+stmtToInstrs bid stmt = do
+  -- traceM $ "-- -------------------------- stmtToInstrs -------------------------- --\n"
+  --     ++ showSDocUnsafe (ppr stmt)
+  platform <- getPlatform
+  case stmt of
+    CmmUnsafeForeignCall target result_regs args
+       -> genCCall target result_regs args bid
+
+    _ -> (,Nothing) <$> case stmt of
+      CmmComment s   -> return (unitOL (COMMENT (ftext s)))
+      CmmTick {}     -> return nilOL
+
+      CmmAssign reg src
+        | isFloatType ty         -> assignReg_FltCode format reg src
+        | otherwise              -> assignReg_IntCode format reg src
+          where ty = cmmRegType platform reg
+                format = cmmTypeFormat ty
+
+      CmmStore addr src
+        | isFloatType ty         -> assignMem_FltCode format addr src
+        | otherwise              -> assignMem_IntCode format addr src
+          where ty = cmmExprType platform src
+                format = cmmTypeFormat ty
+
+      CmmBranch id          -> genBranch id
+
+      --We try to arrange blocks such that the likely branch is the fallthrough
+      --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
+      CmmCondBranch arg true false _prediction ->
+          genCondBranch bid true false arg
+
+      CmmSwitch arg ids -> genSwitch arg ids
+
+      CmmCall { cml_target = arg } -> genJump arg
+
+      CmmUnwind _regs -> return nilOL
+
+      _ -> pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt)
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+--      They are really trees of insns to facilitate fast appending, where a
+--      left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+        = OrdList Instr
+
+-- | Register's passed up the tree.  If the stix code forces the register
+--      to live in a pre-decided machine register, it comes out as @Fixed@;
+--      otherwise, it comes out as @Any@, and the parent can decide which
+--      register to put it in.
+--
+data Register
+        = Fixed Format Reg InstrBlock
+        | Any   Format (Reg -> InstrBlock)
+
+-- | Sometimes we need to change the Format of a register. Primarily during
+-- conversion.
+swizzleRegisterRep :: Format -> Register -> Register
+swizzleRegisterRep format (Fixed _ reg code) = Fixed format reg code
+swizzleRegisterRep format (Any _ codefn)     = Any   format codefn
+
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+  = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
+
+getRegisterReg platform (CmmGlobal mid)
+  = case globalRegMaybe platform mid of
+        Just reg -> RegReal reg
+        Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
+        -- By this stage, the only MagicIds remaining should be the
+        -- ones which map to a real machine register on this
+        -- platform.  Hence if it's not mapped to a registers something
+        -- went wrong earlier in the pipeline.
+-- | Convert a BlockId to some CmmStatic data
+-- TODO: Add JumpTable Logic, see Ticket 19912
+-- jumpTableEntry :: NCGConfig -> Maybe BlockId -> CmmStatic
+-- jumpTableEntry config Nothing   = CmmStaticLit (CmmInt 0 (ncgWordWidth config))
+-- jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+--     where blockLabel = blockLbl blockid
+
+-- -----------------------------------------------------------------------------
+-- General things for putting together code sequences
+
+-- | The dual to getAnyReg: compute an expression into a register, but
+--      we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, Format, InstrBlock)
+getSomeReg expr = do
+  r <- getRegister expr
+  case r of
+    Any rep code -> do
+        tmp <- getNewRegNat rep
+        return (tmp, rep, code tmp)
+    Fixed rep reg code ->
+        return (reg, rep, code)
+
+-- TODO OPT: we might be able give getRegister
+--          a hint, what kind of register we want.
+getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
+getFloatReg expr = do
+  r <- getRegister expr
+  case r of
+    Any rep code | isFloatFormat rep -> do
+      tmp <- getNewRegNat rep
+      return (tmp, rep, code tmp)
+    Any II32 code -> do
+      tmp <- getNewRegNat FF32
+      return (tmp, FF32, code tmp)
+    Any II64 code -> do
+      tmp <- getNewRegNat FF64
+      return (tmp, FF64, code tmp)
+    Any _w _code -> do
+      config <- getConfig
+      pprPanic "can't do getFloatReg on" (pdoc (ncgPlatform config) expr)
+    -- can't do much for fixed.
+    Fixed rep reg code ->
+      return (reg, rep, code)
+
+-- TODO: TODO, bounds. We can't put any immediate
+-- value in. They are constrained.
+-- See Ticket 19911
+litToImm' :: CmmLit -> NatM (Operand, InstrBlock)
+litToImm' lit = return (OpImm (litToImm lit), nilOL)
+
+
+getRegister :: CmmExpr -> NatM Register
+getRegister e = do
+  config <- getConfig
+  getRegister' config (ncgPlatform config) e
+
+-- Note [Handling PIC on AArch64]
+-- AArch64 does not have a special PIC register, the general approach is to
+-- simply go through the GOT, and there is assembly support for this:
+--
+--   // Load the address of 'sym' from the GOT using ADRP and LDR (used for
+--   // position-independent code on AArch64):
+--   adrp x0, #:got:sym
+--   ldr x0, [x0, #:got_lo12:sym]
+--
+-- See also: https://developer.arm.com/documentation/dui0774/i/armclang-integrated-assembler-directives/assembly-expressions
+--
+-- CmmGlobal @PicBaseReg@'s are generated in @GHC.CmmToAsm.PIC@ in the
+-- @cmmMakePicReference@.  This is in turn called from @cmmMakeDynamicReference@
+-- also in @Cmm.CmmToAsm.PIC@ from where it is also exported.  There are two
+-- callsites for this. One is in this module to produce the @target@ in @genCCall@
+-- the other is in @GHC.CmmToAsm@ in @cmmExprNative@.
+--
+-- Conceptually we do not want any special PicBaseReg to be used on AArch64. If
+-- we want to distinguish between symbol loading, we need to address this through
+-- the way we load it, not through a register.
+--
+
+getRegister' :: NCGConfig -> Platform -> CmmExpr -> NatM Register
+-- OPTIMIZATION WARNING: CmmExpr rewrites
+-- 1. Rewrite: Reg + (-n) => Reg - n
+--    TODO: this expression souldn't even be generated to begin with.
+getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt i w1)]) | i < 0
+  = getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt (-i) w1)])
+
+getRegister' config plat (CmmMachOp (MO_Sub w0) [x, CmmLit (CmmInt i w1)]) | i < 0
+  = getRegister' config plat (CmmMachOp (MO_Add w0) [x, CmmLit (CmmInt (-i) w1)])
+
+
+-- Generic case.
+getRegister' config plat expr
+  = case expr of
+    CmmReg (CmmGlobal PicBaseReg)
+      -> pprPanic "getRegisterReg-memory" (ppr $ PicBaseReg)
+    CmmLit lit
+      -> case lit of
+
+        -- TODO handle CmmInt 0 specially, use wzr or xzr.
+
+        CmmInt i W8  -> do
+          return (Any (intFormat W8) (\dst -> unitOL $ annExpr expr (MOV (OpReg W8 dst) (OpImm (ImmInteger (narrowS W8 i))))))
+        CmmInt i W16 -> do
+          return (Any (intFormat W16) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger (narrowS W16 i))))))
+
+        -- We need to be careful to not shorten this for negative literals.
+        -- Those need the upper bits set. We'd either have to explicitly sign
+        -- or figure out something smarter. Lowered to
+        -- `MOV dst XZR`
+        CmmInt i w | isNbitEncodeable 16 i, i >= 0 -> do
+          return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger i)))))
+        CmmInt i w | isNbitEncodeable 32 i, i >= 0 -> do
+          let  half0 = fromIntegral (fromIntegral i :: Word16)
+               half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+          return (Any (intFormat w) (\dst -> toOL [ annExpr expr
+                                                  $ MOV (OpReg W32 dst) (OpImm (ImmInt half0))
+                                                  , MOVK (OpReg W32 dst) (OpImmShift (ImmInt half1) SLSL 16)
+                                                  ]))
+        -- fallback for W32
+        CmmInt i W32 -> do
+          let  half0 = fromIntegral (fromIntegral i :: Word16)
+               half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+          return (Any (intFormat W32) (\dst -> toOL [ annExpr expr
+                                                    $ MOV (OpReg W32 dst) (OpImm (ImmInt half0))
+                                                    , MOVK (OpReg W32 dst) (OpImmShift (ImmInt half1) SLSL 16)
+                                                    ]))
+        -- anything else
+        CmmInt i W64 -> do
+          let  half0 = fromIntegral (fromIntegral i :: Word16)
+               half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+               half2 = fromIntegral (fromIntegral (i `shiftR` 32) :: Word16)
+               half3 = fromIntegral (fromIntegral (i `shiftR` 48) :: Word16)
+          return (Any (intFormat W64) (\dst -> toOL [ annExpr expr
+                                                    $ MOV (OpReg W64 dst) (OpImm (ImmInt half0))
+                                                    , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half1) SLSL 16)
+                                                    , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half2) SLSL 32)
+                                                    , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half3) SLSL 48)
+                                                    ]))
+        CmmInt _i rep -> do
+          (op, imm_code) <- litToImm' lit
+          return (Any (intFormat rep) (\dst -> imm_code `snocOL` annExpr expr (MOV (OpReg rep dst) op)))
+
+        -- floatToBytes (fromRational f)
+        CmmFloat 0 w   -> do
+          (op, imm_code) <- litToImm' lit
+          return (Any (floatFormat w) (\dst -> imm_code `snocOL` annExpr expr (MOV (OpReg w dst) op)))
+
+        CmmFloat _f W8  -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (pdoc plat expr)
+        CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (pdoc plat expr)
+        CmmFloat f W32 -> do
+          let word = castFloatToWord32 (fromRational f) :: Word32
+              half0 = fromIntegral (fromIntegral word :: Word16)
+              half1 = fromIntegral (fromIntegral (word `shiftR` 16) :: Word16)
+          tmp <- getNewRegNat (intFormat W32)
+          return (Any (floatFormat W32) (\dst -> toOL [ annExpr expr
+                                                      $ MOV (OpReg W32 tmp) (OpImm (ImmInt half0))
+                                                      , MOVK (OpReg W32 tmp) (OpImmShift (ImmInt half1) SLSL 16)
+                                                      , MOV (OpReg W32 dst) (OpReg W32 tmp)
+                                                      ]))
+        CmmFloat f W64 -> do
+          let word = castDoubleToWord64 (fromRational f) :: Word64
+              half0 = fromIntegral (fromIntegral word :: Word16)
+              half1 = fromIntegral (fromIntegral (word `shiftR` 16) :: Word16)
+              half2 = fromIntegral (fromIntegral (word `shiftR` 32) :: Word16)
+              half3 = fromIntegral (fromIntegral (word `shiftR` 48) :: Word16)
+          tmp <- getNewRegNat (intFormat W64)
+          return (Any (floatFormat W64) (\dst -> toOL [ annExpr expr
+                                                      $ MOV (OpReg W64 tmp) (OpImm (ImmInt half0))
+                                                      , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half1) SLSL 16)
+                                                      , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half2) SLSL 32)
+                                                      , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half3) SLSL 48)
+                                                      , MOV (OpReg W64 dst) (OpReg W64 tmp)
+                                                      ]))
+        CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (pdoc plat expr)
+        CmmVec _ -> pprPanic "getRegister' (CmmLit:CmmVec): " (pdoc plat expr)
+        CmmLabel _lbl -> do
+          (op, imm_code) <- litToImm' lit
+          let rep = cmmLitType plat lit
+              format = cmmTypeFormat rep
+          return (Any format (\dst -> imm_code `snocOL` (annExpr expr $ LDR format (OpReg (formatToWidth format) dst) op)))
+
+        CmmLabelOff _lbl off | isNbitEncodeable 12 (fromIntegral off) -> do
+          (op, imm_code) <- litToImm' lit
+          let rep = cmmLitType plat lit
+              format = cmmTypeFormat rep
+              -- width = typeWidth rep
+          return (Any format (\dst -> imm_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op))
+
+        CmmLabelOff lbl off -> do
+          (op, imm_code) <- litToImm' (CmmLabel lbl)
+          let rep = cmmLitType plat lit
+              format = cmmTypeFormat rep
+              width = typeWidth rep
+          (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
+          return (Any format (\dst -> imm_code `appOL` off_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op `snocOL` ADD (OpReg width dst) (OpReg width dst) (OpReg width off_r)))
+
+        CmmLabelDiffOff _ _ _ _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
+        CmmBlock _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
+        CmmHighStackMark -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
+    CmmLoad mem rep -> do
+      Amode addr addr_code <- getAmode plat mem
+      let format = cmmTypeFormat rep
+      return (Any format (\dst -> addr_code `snocOL` LDR format (OpReg (formatToWidth format) dst) (OpAddr addr)))
+    CmmStackSlot _ _
+      -> pprPanic "getRegister' (CmmStackSlot): " (pdoc plat expr)
+    CmmReg reg
+      -> return (Fixed (cmmTypeFormat (cmmRegType plat reg))
+                       (getRegisterReg plat reg)
+                       nilOL)
+    CmmRegOff reg off | isNbitEncodeable 12 (fromIntegral off) -> do
+      getRegister' config plat $
+            CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+          where width = typeWidth (cmmRegType plat reg)
+
+    CmmRegOff reg off -> do
+      (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
+      (reg, _format, code) <- getSomeReg $ CmmReg reg
+      return $ Any (intFormat width) (\dst -> off_code `appOL` code `snocOL` ADD (OpReg width dst) (OpReg width reg) (OpReg width off_r))
+          where width = typeWidth (cmmRegType plat reg)
+
+
+
+    -- for MachOps, see GHC.Cmm.MachOp
+    -- For CmmMachOp, see GHC.Cmm.Expr
+    CmmMachOp op [e] -> do
+      (reg, _format, code) <- getSomeReg e
+      case op of
+        MO_Not w -> return $ Any (intFormat w) (\dst -> code `snocOL` MVN (OpReg w dst) (OpReg w reg))
+
+        MO_S_Neg w -> return $ Any (intFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))
+        MO_F_Neg w -> return $ Any (floatFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))
+
+        MO_SF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))  -- (Signed ConVerT Float)
+        MO_FS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from reg)) -- (float convert (-> zero) signed)
+
+        -- TODO this is very hacky
+        -- Note, UBFM and SBFM expect source and target register to be of the same size, so we'll use @max from to@
+        -- UBFM will set the high bits to 0. SBFM will copy the sign (sign extend).
+        MO_UU_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` UBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
+        MO_SS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` SBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
+        MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` FCVT (OpReg to dst) (OpReg from reg))
+
+        -- Conversions
+        MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
+
+        _ -> pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
+      where toImm W8 =  (OpImm (ImmInt 7))
+            toImm W16 = (OpImm (ImmInt 15))
+            toImm W32 = (OpImm (ImmInt 31))
+            toImm W64 = (OpImm (ImmInt 63))
+            toImm W128 = (OpImm (ImmInt 127))
+            toImm W256 = (OpImm (ImmInt 255))
+            toImm W512 = (OpImm (ImmInt 511))
+    -- Dyadic machops:
+    --
+    -- The general idea is:
+    -- compute x<i> <- x
+    -- compute x<j> <- y
+    -- OP x<r>, x<i>, x<j>
+    --
+    -- TODO: for now we'll only implement the 64bit versions. And rely on the
+    --      fallthrough to alert us if things go wrong!
+    -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring
+    -- 0. TODO This should not exist! Rewrite: Reg +- 0 -> Reg
+    CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
+    CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
+    -- 1. Compute Reg +/- n directly.
+    --    For Add/Sub we can directly encode 12bits, or 12bits lsl #12.
+    CmmMachOp (MO_Add w) [(CmmReg reg), CmmLit (CmmInt n _)]
+      | n > 0 && n < 4096 -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ADD (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+    CmmMachOp (MO_Sub w) [(CmmReg reg), CmmLit (CmmInt n _)]
+      | n > 0 && n < 4096 -> return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (SUB (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      -- TODO: 12bits lsl #12; e.g. lower 12 bits of n are 0; shift n >> 12, and set lsl to #12.
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+
+    -- 2. Shifts. x << n, x >> n.
+    CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W32, 0 <= n, n < 32 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+    CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W64, 0 <= n, n < 64 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSL (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+
+    CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W32, 0 <= n, n < 32 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+    CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W64, 0 <= n, n < 64 -> do
+      (reg_x, _format_x, code_x) <- getSomeReg x
+      return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger n))))
+
+    -- 3. Logic &&, ||
+    CmmMachOp (MO_And w) [(CmmReg reg), CmmLit (CmmInt n _)] | isBitMaskImmediate (fromIntegral n) ->
+      return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (AND (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+
+    CmmMachOp (MO_Or w) [(CmmReg reg), CmmLit (CmmInt n _)] | isBitMaskImmediate (fromIntegral n) ->
+      return $ Any (intFormat w) (\d -> unitOL $ annExpr expr (ORR (OpReg w d) (OpReg w' r') (OpImm (ImmInteger n))))
+      where w' = formatToWidth (cmmTypeFormat (cmmRegType plat reg))
+            r' = getRegisterReg plat reg
+
+    -- Generic case.
+    CmmMachOp op [x, y] -> do
+      -- alright, so we have an operation, and two expressions. And we want to essentially do
+      -- ensure we get float regs
+      let genOp w op = do
+            (reg_x, format_x, code_x) <- getSomeReg x
+            (reg_y, format_y, code_y) <- getSomeReg y
+            when ((isFloatFormat format_x && isIntFormat format_y) || (isIntFormat format_x && isFloatFormat format_y)) $ pprPanic "getRegister:genOp" (text "formats don't match:" <+> text (show format_x) <+> text "/=" <+> text (show format_y))
+            return $ Any format_x (\dst -> code_x `appOL` code_y `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
+
+          withTempIntReg w op = OpReg w <$> getNewRegNat (intFormat w) >>= op
+          -- withTempFloatReg w op = OpReg w <$> getNewRegNat (floatFormat w) >>= op
+
+          intOp w op = do
+            -- compute x<m> <- x
+            -- compute x<o> <- y
+            -- <OP> x<n>, x<m>, x<o>
+            (reg_x, _format_x, code_x) <- getSomeReg x
+            (reg_y, _format_y, code_y) <- getSomeReg y
+            return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
+          floatOp w op = do
+            (reg_fx, _format_x, code_fx) <- getFloatReg x
+            (reg_fy, _format_y, code_fy) <- getFloatReg y
+            return $ Any (floatFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))
+          -- need a special one for conditionals, as they return ints
+          floatCond w op = do
+            (reg_fx, _format_x, code_fx) <- getFloatReg x
+            (reg_fy, _format_y, code_fy) <- getFloatReg y
+            return $ Any (intFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))
+
+      case op of
+        -- Integer operations
+        -- Add/Sub should only be Interger Options.
+        -- But our Cmm parser doesn't care about types
+        -- and thus we end up with <float> + <float> => MO_Add <float> <float>
+        MO_Add w -> genOp w (\d x y -> unitOL $ annExpr expr (ADD d x y))
+        MO_Sub w -> genOp w (\d x y -> unitOL $ annExpr expr (SUB d x y))
+        --    31  30  29  28
+        --  .---+---+---+---+-- - -
+        --  | N | Z | C | V |
+        --  '---+---+---+---+-- - -
+        --  Negative
+        --  Zero
+        --  Carry
+        --  oVerflow
+        --
+        --  .------+-------------------------------------+-----------------+----------.
+        --  | Code | Meaning                             | Flags           | Encoding |
+        --  |------+-------------------------------------+-----------------+----------|
+        --  |  EQ  | Equal                               | Z = 1           | 0000     |
+        --  |  NE  | Not Equal                           | Z = 0           | 0001     |
+        --  |  HI  | Unsigned Higher                     | C = 1 && Z = 0  | 1000     |
+        --  |  HS  | Unsigned Higher or Same             | C = 1           | 0010     |
+        --  |  LS  | Unsigned Lower or Same              | C = 0 || Z = 1  | 1001     |
+        --  |  LO  | Unsigned Lower                      | C = 0           | 0011     |
+        --  |  GT  | Signed Greater Than                 | Z = 0 && N = V  | 1100     |
+        --  |  GE  | Signed Greater Than or Equal        | N = V           | 1010     |
+        --  |  LE  | Signed Less Than or Equal           | Z = 1 || N /= V | 1101     |
+        --  |  LT  | Signed Less Than                    | N /= V          | 1011     |
+        --  |  CS  | Carry Set (Unsigned Overflow)       | C = 1           | 0010     |
+        --  |  CC  | Carry Clear (No Unsigned Overflow)  | C = 0           | 0011     |
+        --  |  VS  | Signed Overflow                     | V = 1           | 0110     |
+        --  |  VC  | No Signed Overflow                  | V = 0           | 0111     |
+        --  |  MI  | Minus, Negative                     | N = 1           | 0100     |
+        --  |  PL  | Plus, Positive or Zero (!)          | N = 0           | 0101     |
+        --  |  AL  | Always                              | Any             | 1110     |
+        --  |  NV  | Never                               | Any             | 1111     |
+        --- '-------------------------------------------------------------------------'
+
+        MO_Eq w -> intOp w (\d x y -> toOL [ CMP x y, CSET d EQ ])
+        MO_Ne w -> intOp w (\d x y -> toOL [ CMP x y, CSET d NE ])
+        MO_Mul w -> intOp w (\d x y -> unitOL $ MUL d x y)
+
+        -- Signed multiply/divide
+        MO_S_MulMayOflo w -> intOp w (\d x y -> toOL [ MUL d x y, CSET d VS ])
+        MO_S_Quot w -> intOp w (\d x y -> unitOL $ SDIV d x y)
+
+        -- No native rem instruction. So we'll compute the following
+        -- Rd  <- Rx / Ry             | 2 <- 7 / 3      -- SDIV Rd Rx Ry
+        -- Rd' <- Rx - Rd * Ry        | 1 <- 7 - 2 * 3  -- MSUB Rd' Rd Ry Rx
+        --        |     '---|----------------|---'   |
+        --        |         '----------------|-------'
+        --        '--------------------------'
+        -- Note the swap in Rx and Ry.
+        MO_S_Rem w -> withTempIntReg w $ \t ->
+          intOp w (\d x y -> toOL [ SDIV t x y, MSUB d t y x ])
+
+        -- Unsigned multiply/divide
+        MO_U_MulMayOflo _w -> unsupportedP plat expr
+        MO_U_Quot w -> intOp w (\d x y -> unitOL $ UDIV d x y)
+        MO_U_Rem w  -> withTempIntReg w $ \t ->
+          intOp w (\d x y -> toOL [ UDIV t x y, MSUB d t y x ])
+
+        -- Signed comparisons -- see above for the CSET discussion
+        MO_S_Ge w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SGE ])
+        MO_S_Le w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SLE ])
+        MO_S_Gt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SGT ])
+        MO_S_Lt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d SLT ])
+
+        -- Unsigned comparisons
+        MO_U_Ge w -> intOp w (\d x y -> toOL [ CMP x y, CSET d UGE ])
+        MO_U_Le w -> intOp w (\d x y -> toOL [ CMP x y, CSET d ULE ])
+        MO_U_Gt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d UGT ])
+        MO_U_Lt w -> intOp w (\d x y -> toOL [ CMP x y, CSET d ULT ])
+
+        -- Floating point arithmetic
+        MO_F_Add w   -> floatOp w (\d x y -> unitOL $ ADD d x y)
+        MO_F_Sub w   -> floatOp w (\d x y -> unitOL $ SUB d x y)
+        MO_F_Mul w   -> floatOp w (\d x y -> unitOL $ MUL d x y)
+        MO_F_Quot w  -> floatOp w (\d x y -> unitOL $ SDIV d x y)
+
+        -- Floating point comparison
+        MO_F_Eq w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d EQ ])
+        MO_F_Ne w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d NE ])
+
+        -- careful with the floating point operations.
+        -- SLE is effectively LE or unordered (NaN)
+        -- SLT is the same. ULE, and ULT will not return true for NaN.
+        -- This is a bit counter intutive. Don't let yourself be fooled by
+        -- the S/U prefix for floats, it's only meaningful for integers.
+        MO_F_Ge w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OGE ])
+        MO_F_Le w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLE ]) -- x <= y <=> y > x
+        MO_F_Gt w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OGT ])
+        MO_F_Lt w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLT ]) -- x < y <=> y >= x
+
+        -- Bitwise operations
+        MO_And   w -> intOp w (\d x y -> unitOL $ AND d x y)
+        MO_Or    w -> intOp w (\d x y -> unitOL $ ORR d x y)
+        MO_Xor   w -> intOp w (\d x y -> unitOL $ EOR d x y)
+        -- MO_Not   W64 ->
+        MO_Shl   w -> intOp w (\d x y -> unitOL $ LSL d x y)
+        MO_U_Shr w -> intOp w (\d x y -> unitOL $ LSR d x y)
+        MO_S_Shr w -> intOp w (\d x y -> unitOL $ ASR d x y)
+
+        -- TODO
+
+        op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
+    CmmMachOp _op _xs
+      -> pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr)
+
+  where
+    unsupportedP :: OutputableP env a => env -> a -> b
+    unsupportedP platform op = pprPanic "Unsupported op:" (pdoc platform op)
+
+    isNbitEncodeable :: Int -> Integer -> Bool
+    isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
+    -- This needs to check if n can be encoded as a bitmask immediate:
+    --
+    -- See https://stackoverflow.com/questions/30904718/range-of-immediate-values-in-armv8-a64-assembly
+    --
+    isBitMaskImmediate :: Integer -> Bool
+    isBitMaskImmediate i = i `elem` [0b0000_0001, 0b0000_0010, 0b0000_0100, 0b0000_1000, 0b0001_0000, 0b0010_0000, 0b0100_0000, 0b1000_0000
+                                    ,0b0000_0011, 0b0000_0110, 0b0000_1100, 0b0001_1000, 0b0011_0000, 0b0110_0000, 0b1100_0000
+                                    ,0b0000_0111, 0b0000_1110, 0b0001_1100, 0b0011_1000, 0b0111_0000, 0b1110_0000
+                                    ,0b0000_1111, 0b0001_1110, 0b0011_1100, 0b0111_1000, 0b1111_0000
+                                    ,0b0001_1111, 0b0011_1110, 0b0111_1100, 0b1111_1000
+                                    ,0b0011_1111, 0b0111_1110, 0b1111_1100
+                                    ,0b0111_1111, 0b1111_1110
+                                    ,0b1111_1111]
+
+
+-- -----------------------------------------------------------------------------
+--  The 'Amode' type: Memory addressing modes passed up the tree.
+data Amode = Amode AddrMode InstrBlock
+
+getAmode :: Platform -> CmmExpr -> NatM Amode
+-- TODO: Specialize stuff we can destructure here.
+
+-- OPTIMIZATION WARNING: Addressing modes.
+-- Addressing options:
+-- LDUR/STUR: imm9: -256 - 255
+getAmode platform (CmmRegOff reg off) | -256 <= off, off <= 255
+  = return $ Amode (AddrRegImm reg' off') nilOL
+    where reg' = getRegisterReg platform reg
+          off' = ImmInt off
+-- LDR/STR: imm12: if reg is 32bit: 0 -- 16380 in multiples of 4
+getAmode platform (CmmRegOff reg off)
+  | typeWidth (cmmRegType platform reg) == W32, 0 <= off, off <= 16380, off `mod` 4 == 0
+  = return $ Amode (AddrRegImm reg' off') nilOL
+    where reg' = getRegisterReg platform reg
+          off' = ImmInt off
+-- LDR/STR: imm12: if reg is 64bit: 0 -- 32760 in multiples of 8
+getAmode platform (CmmRegOff reg off)
+  | typeWidth (cmmRegType platform reg) == W64, 0 <= off, off <= 32760, off `mod` 8 == 0
+  = return $ Amode (AddrRegImm reg' off') nilOL
+    where reg' = getRegisterReg platform reg
+          off' = ImmInt off
+
+-- For Stores we often see something like this:
+-- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2)
+-- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ]
+-- for `n` in range.
+getAmode _platform (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')])
+  | -256 <= off, off <= 255
+  = do (reg, _format, code) <- getSomeReg expr
+       return $ Amode (AddrRegImm reg (ImmInteger off)) code
+
+getAmode _platform (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')])
+  | -256 <= -off, -off <= 255
+  = do (reg, _format, code) <- getSomeReg expr
+       return $ Amode (AddrRegImm reg (ImmInteger (-off))) code
+
+-- Generic case
+getAmode _platform expr
+  = do (reg, _format, code) <- getSomeReg expr
+       return $ Amode (AddrReg reg) code
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business.  Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers.  If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side.  This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
+
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock
+
+assignMem_IntCode rep addrE srcE
+  = do
+    (src_reg, _format, code) <- getSomeReg srcE
+    platform <- getPlatform
+    Amode addr addr_code <- getAmode platform addrE
+    return $ COMMENT (text "CmmStore" <+> parens (text (show addrE)) <+> parens (text (show srcE)))
+            `consOL` (code
+            `appOL` addr_code
+            `snocOL` STR rep (OpReg (formatToWidth rep) src_reg) (OpAddr addr))
+
+assignReg_IntCode _ reg src
+  = do
+    platform <- getPlatform
+    let dst = getRegisterReg platform reg
+    r <- getRegister src
+    return $ case r of
+      Any _ code              -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` code dst
+      Fixed format freg fcode -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` (fcode `snocOL` MOV (OpReg (formatToWidth format) dst) (OpReg (formatToWidth format) freg))
+
+-- Let's treat Floating point stuff
+-- as integer code for now. Opaque.
+assignMem_FltCode = assignMem_IntCode
+assignReg_FltCode = assignReg_IntCode
+
+-- -----------------------------------------------------------------------------
+-- Jumps
+genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
+genJump expr@(CmmLit (CmmLabel lbl))
+  = return $ unitOL (annExpr expr (J (TLabel lbl)))
+
+genJump expr = do
+    (target, _format, code) <- getSomeReg expr
+    return (code `appOL` unitOL (annExpr expr (J (TReg target))))
+
+-- -----------------------------------------------------------------------------
+--  Unconditional branches
+genBranch :: BlockId -> NatM InstrBlock
+genBranch = return . toOL . mkJumpInstr
+
+-- -----------------------------------------------------------------------------
+-- Conditional branches
+genCondJump
+    :: BlockId
+    -> CmmExpr
+    -> NatM InstrBlock
+genCondJump bid expr = do
+    case expr of
+      -- Optimized == 0 case.
+      CmmMachOp (MO_Eq w) [x, CmmLit (CmmInt 0 _)] -> do
+        (reg_x, _format_x, code_x) <- getSomeReg x
+        return $ code_x `snocOL` (annExpr expr (CBZ (OpReg w reg_x) (TBlock bid)))
+
+      -- Optimized /= 0 case.
+      CmmMachOp (MO_Ne w) [x, CmmLit (CmmInt 0 _)] -> do
+        (reg_x, _format_x, code_x) <- getSomeReg x
+        return $ code_x `snocOL`  (annExpr expr (CBNZ (OpReg w reg_x) (TBlock bid)))
+
+      -- Generic case.
+      CmmMachOp mop [x, y] -> do
+
+        let bcond w cmp = do
+              -- compute both sides.
+              (reg_x, _format_x, code_x) <- getSomeReg x
+              (reg_y, _format_y, code_y) <- getSomeReg y
+              return $ code_x `appOL` code_y `snocOL` CMP (OpReg w reg_x) (OpReg w reg_y) `snocOL` (annExpr expr (BCOND cmp (TBlock bid)))
+            fbcond w cmp = do
+              -- ensure we get float regs
+              (reg_fx, _format_fx, code_fx) <- getFloatReg x
+              (reg_fy, _format_fy, code_fy) <- getFloatReg y
+              return $ code_fx `appOL` code_fy `snocOL` CMP (OpReg w reg_fx) (OpReg w reg_fy) `snocOL` (annExpr expr (BCOND cmp (TBlock bid)))
+
+        case mop of
+          MO_F_Eq w -> fbcond w EQ
+          MO_F_Ne w -> fbcond w NE
+
+          MO_F_Gt w -> fbcond w OGT
+          MO_F_Ge w -> fbcond w OGE
+          MO_F_Lt w -> fbcond w OLT
+          MO_F_Le w -> fbcond w OLE
+
+          MO_Eq w   -> bcond w EQ
+          MO_Ne w   -> bcond w NE
+
+          MO_S_Gt w -> bcond w SGT
+          MO_S_Ge w -> bcond w SGE
+          MO_S_Lt w -> bcond w SLT
+          MO_S_Le w -> bcond w SLE
+          MO_U_Gt w -> bcond w UGT
+          MO_U_Ge w -> bcond w UGE
+          MO_U_Lt w -> bcond w ULT
+          MO_U_Le w -> bcond w ULE
+          _ -> pprPanic "AArch64.genCondJump:case mop: " (text $ show expr)
+      _ -> pprPanic "AArch64.genCondJump: " (text $ show expr)
+
+
+genCondBranch
+    :: BlockId      -- the source of the jump
+    -> BlockId      -- the true branch target
+    -> BlockId      -- the false branch target
+    -> CmmExpr      -- the condition on which to branch
+    -> NatM InstrBlock -- Instructions
+
+genCondBranch _ true false expr = do
+  b1 <- genCondJump true expr
+  b2 <- genBranch false
+  return (b1 `appOL` b2)
+
+-- -----------------------------------------------------------------------------
+--  Generating C calls
+
+-- Now the biggest nightmare---calls.  Most of the nastiness is buried in
+-- @get_arg@, which moves the arguments to the correct registers/stack
+-- locations.  Apart from that, the code is easy.
+--
+-- As per *convention*:
+-- x0-x7:   (volatile) argument registers
+-- x8:      (volatile) indirect result register / Linux syscall no
+-- x9-x15:  (volatile) caller saved regs
+-- x16,x17: (volatile) intra-procedure-call registers
+-- x18:     (volatile) platform register. don't use for portability
+-- x19-x28: (non-volatile) callee save regs
+-- x29:     (non-volatile) frame pointer
+-- x30:                    link register
+-- x31:                    stack pointer / zero reg
+--
+-- Thus, this is what a c function will expect. Find the arguments in x0-x7,
+-- anything above that on the stack.  We'll ignore c functions with more than
+-- 8 arguments for now.  Sorry.
+--
+-- We need to make sure we preserve x9-x15, don't want to touch x16, x17.
+
+-- Note [PLT vs GOT relocations]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- When linking objects together, we may need to lookup foreign references. That
+-- is symbolic references to functions or values in other objects. When
+-- compiling the object, we can not know where those elements will end up in
+-- memory (relative to the current location). Thus the use of symbols. There
+-- are two types of items we are interested, code segments we want to jump to
+-- and continue execution there (functions, ...), and data items we want to look
+-- up (strings, numbers, ...). For functions we can use the fact that we can use
+-- an intermediate jump without visibility to the programs execution.  If we
+-- want to jump to a function that is simply too far away to reach for the B/BL
+-- instruction, we can create a small piece of code that loads the full target
+-- address and jumps to that on demand. Say f wants to call g, however g is out
+-- of range for a direct jump, we can create a function h in range for f, that
+-- will load the address of g, and jump there. The area where we construct h
+-- is called the Procedure Linking Table (PLT), we have essentially replaced
+-- f -> g with f -> h -> g.  This is fine for function calls.  However if we
+-- want to lookup values, this trick doesn't work, so we need something else.
+-- We will instead reserve a slot in memory, and have a symbol pointing to that
+-- slot. Now what we essentially do is, we reference that slot, and expect that
+-- slot to hold the final resting address of the data we are interested in.
+-- Thus what that symbol really points to is the location of the final data.
+-- The block of memory where we hold all those slots is the Global Offset Table
+-- (GOT).  Instead of x <- $foo, we now do y <- $fooPtr, and x <- [$y].
+--
+-- For JUMP/CALLs we have 26bits (+/- 128MB), for conditional branches we only
+-- have 19bits (+/- 1MB).  Symbol lookups are also within +/- 1MB, thus for most
+-- of the LOAD/STOREs we'd want to use adrp, and add to compute a value within
+-- 4GB of the PC, and load that.  For anything outside of that range, we'd have
+-- to go through the GOT.
+--
+--  adrp x0, <symbol>
+--  add x0, :lo:<symbol>
+--
+-- will compute the address of <symbol> int x0 if <symbol> is within 4GB of the
+-- PC.
+--
+-- If we want to get the slot in the global offset table (GOT), we can do this:
+--
+--   adrp x0, #:got:<symbol>
+--   ldr x0, [x0, #:got_lo12:<symbol>]
+--
+-- this will compute the address anywhere in the addressable 64bit space into
+-- x0, by loading the address from the GOT slot.
+--
+-- To actually get the value of <symbol>, we'd need to ldr x0, x0 still, which
+-- for the first case can be optimized to use ldr x0, [x0, #:lo12:<symbol>]
+-- instaed of the add instruction.
+--
+-- As the memory model for AArch64 for PIC is considered to be +/- 4GB, we do
+-- not need to go through the GOT, unless we want to address the full address
+-- range within 64bit.
+
+genCCall
+    :: ForeignTarget      -- function to call
+    -> [CmmFormal]        -- where to put the result
+    -> [CmmActual]        -- arguments (of mixed type)
+    -> BlockId            -- The block we are in
+    -> NatM (InstrBlock, Maybe BlockId)
+-- TODO: Specialize where we can.
+-- Generic impl
+genCCall target dest_regs arg_regs bid = do
+  -- we want to pass arg_regs into allArgRegs
+  -- pprTraceM "genCCall target" (ppr target)
+  -- pprTraceM "genCCall formal" (ppr dest_regs)
+  -- pprTraceM "genCCall actual" (ppr arg_regs)
+
+  case target of
+    -- The target :: ForeignTarget call can either
+    -- be a foreign procedure with an address expr
+    -- and a calling convention.
+    ForeignTarget expr _cconv -> do
+      (call_target, call_target_code) <- case expr of
+        -- if this is a label, let's just directly to it.  This will produce the
+        -- correct CALL relocation for BL...
+        (CmmLit (CmmLabel lbl)) -> pure (TLabel lbl, nilOL)
+        -- ... if it's not a label--well--let's compute the expression into a
+        -- register and jump to that. See Note [PLT vs GOT relocations]
+        _ -> do (reg, _format, reg_code) <- getSomeReg expr
+                pure (TReg reg, reg_code)
+      -- compute the code and register logic for all arg_regs.
+      -- this will give us the format information to match on.
+      arg_regs' <- mapM getSomeReg arg_regs
+
+      -- Now this is stupid.  Our Cmm expressions doesn't carry the proper sizes
+      -- so while in Cmm we might get W64 incorrectly for an int, that is W32 in
+      -- STG; this thenn breaks packing of stack arguments, if we need to pack
+      -- for the pcs, e.g. darwinpcs.  Option one would be to fix the Int type
+      -- in Cmm proper. Option two, which we choose here is to use extended Hint
+      -- information to contain the size information and use that when packing
+      -- arguments, spilled onto the stack.
+      let (_res_hints, arg_hints) = foreignTargetHints target
+          arg_regs'' = zipWith (\(r, f, c) h -> (r,f,h,c)) arg_regs' arg_hints
+
+      platform <- getPlatform
+      let packStack = platformOS platform == OSDarwin
+
+      (stackSpace', passRegs, passArgumentsCode) <- passArguments packStack allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL
+
+      -- if we pack the stack, we may need to adjust to multiple of 8byte.
+      -- if we don't pack the stack, it will always be multiple of 8.
+      let stackSpace = if stackSpace' `mod` 8 /= 0
+                       then 8 * (stackSpace' `div` 8 + 1)
+                       else stackSpace'
+
+      (returnRegs, readResultsCode)   <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL
+
+      let moveStackDown 0 = toOL [ PUSH_STACK_FRAME
+                                 , DELTA (-16) ]
+          moveStackDown i | odd i = moveStackDown (i + 1)
+          moveStackDown i = toOL [ PUSH_STACK_FRAME
+                                 , SUB (OpReg W64 (regSingle 31)) (OpReg W64 (regSingle 31)) (OpImm (ImmInt (8 * i)))
+                                 , DELTA (-8 * i - 16) ]
+          moveStackUp 0 = toOL [ POP_STACK_FRAME
+                               , DELTA 0 ]
+          moveStackUp i | odd i = moveStackUp (i + 1)
+          moveStackUp i = toOL [ ADD (OpReg W64 (regSingle 31)) (OpReg W64 (regSingle 31)) (OpImm (ImmInt (8 * i)))
+                               , POP_STACK_FRAME
+                               , DELTA 0 ]
+
+      let code =    call_target_code          -- compute the label (possibly into a register)
+            `appOL` moveStackDown (stackSpace `div` 8)
+            `appOL` passArgumentsCode         -- put the arguments into x0, ...
+            `appOL` (unitOL $ BL call_target passRegs returnRegs) -- branch and link.
+            `appOL` readResultsCode           -- parse the results into registers
+            `appOL` moveStackUp (stackSpace `div` 8)
+      return (code, Nothing)
+
+    -- or a possibly side-effecting machine operation
+    -- mop :: CallishMachOp (see GHC.Cmm.MachOp)
+    PrimTarget mop -> do
+      -- We'll need config to construct forien targets
+      case mop of
+        -- 64 bit float ops
+        MO_F64_Pwr   -> mkCCall "pow"
+
+        MO_F64_Sin   -> mkCCall "sin"
+        MO_F64_Cos   -> mkCCall "cos"
+        MO_F64_Tan   -> mkCCall "tan"
+
+        MO_F64_Sinh  -> mkCCall "sinh"
+        MO_F64_Cosh  -> mkCCall "cosh"
+        MO_F64_Tanh  -> mkCCall "tanh"
+
+        MO_F64_Asin  -> mkCCall "asin"
+        MO_F64_Acos  -> mkCCall "acos"
+        MO_F64_Atan  -> mkCCall "atan"
+
+        MO_F64_Asinh -> mkCCall "asinh"
+        MO_F64_Acosh -> mkCCall "acosh"
+        MO_F64_Atanh -> mkCCall "atanh"
+
+        MO_F64_Log   -> mkCCall "log"
+        MO_F64_Log1P -> mkCCall "log1p"
+        MO_F64_Exp   -> mkCCall "exp"
+        MO_F64_ExpM1 -> mkCCall "expm1"
+        MO_F64_Fabs  -> mkCCall "fabs"
+        MO_F64_Sqrt  -> mkCCall "sqrt"
+
+        -- 32 bit float ops
+        MO_F32_Pwr   -> mkCCall "powf"
+
+        MO_F32_Sin   -> mkCCall "sinf"
+        MO_F32_Cos   -> mkCCall "cosf"
+        MO_F32_Tan   -> mkCCall "tanf"
+        MO_F32_Sinh  -> mkCCall "sinhf"
+        MO_F32_Cosh  -> mkCCall "coshf"
+        MO_F32_Tanh  -> mkCCall "tanhf"
+        MO_F32_Asin  -> mkCCall "asinf"
+        MO_F32_Acos  -> mkCCall "acosf"
+        MO_F32_Atan  -> mkCCall "atanf"
+        MO_F32_Asinh -> mkCCall "asinhf"
+        MO_F32_Acosh -> mkCCall "acoshf"
+        MO_F32_Atanh -> mkCCall "atanhf"
+        MO_F32_Log   -> mkCCall "logf"
+        MO_F32_Log1P -> mkCCall "log1pf"
+        MO_F32_Exp   -> mkCCall "expf"
+        MO_F32_ExpM1 -> mkCCall "expm1f"
+        MO_F32_Fabs  -> mkCCall "fasbf"
+        MO_F32_Sqrt  -> mkCCall "sqrtf"
+
+        -- Conversion
+        MO_UF_Conv w        -> mkCCall (word2FloatLabel w)
+
+        -- Arithmatic
+        -- These are not supported on X86, so I doubt they are used much.
+        MO_S_Mul2     _w -> unsupported mop
+        MO_S_QuotRem  _w -> unsupported mop
+        MO_U_QuotRem  _w -> unsupported mop
+        MO_U_QuotRem2 _w -> unsupported mop
+        MO_Add2       _w -> unsupported mop
+        MO_AddWordC   _w -> unsupported mop
+        MO_SubWordC   _w -> unsupported mop
+        MO_AddIntC    _w -> unsupported mop
+        MO_SubIntC    _w -> unsupported mop
+        MO_U_Mul2     _w -> unsupported mop
+
+        -- Memory Ordering
+        -- TODO DMBSY is probably *way* too much!
+        MO_ReadBarrier      ->  return (unitOL DMBSY, Nothing)
+        MO_WriteBarrier     ->  return (unitOL DMBSY, Nothing)
+        MO_Touch            ->  return (nilOL, Nothing) -- Keep variables live (when using interior pointers)
+        -- Prefetch
+        MO_Prefetch_Data _n -> return (nilOL, Nothing) -- Prefetch hint.
+
+        -- Memory copy/set/move/cmp, with alignment for optimization
+
+        -- TODO Optimize and use e.g. quad registers to move memory around instead
+        -- of offloading this to memcpy. For small memcpys we can utilize
+        -- the 128bit quad registers in NEON to move block of bytes around.
+        -- Might also make sense of small memsets? Use xzr? What's the function
+        -- call overhead?
+        MO_Memcpy  _align   -> mkCCall "memcpy"
+        MO_Memset  _align   -> mkCCall "memset"
+        MO_Memmove _align   -> mkCCall "memmove"
+        MO_Memcmp  _align   -> mkCCall "memcmp"
+
+        MO_SuspendThread    -> mkCCall "suspendThread"
+        MO_ResumeThread     -> mkCCall "resumeThread"
+
+        MO_PopCnt w         -> mkCCall (popCntLabel w)
+        MO_Pdep w           -> mkCCall (pdepLabel w)
+        MO_Pext w           -> mkCCall (pextLabel w)
+        MO_Clz w            -> mkCCall (clzLabel w)
+        MO_Ctz w            -> mkCCall (ctzLabel w)
+        MO_BSwap w          -> mkCCall (bSwapLabel w)
+        MO_BRev w           -> mkCCall (bRevLabel w)
+
+        -- -- Atomic read-modify-write.
+        MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop)
+        MO_AtomicRead w     -> mkCCall (atomicReadLabel w)
+        MO_AtomicWrite w    -> mkCCall (atomicWriteLabel w)
+        MO_Cmpxchg w        -> mkCCall (cmpxchgLabel w)
+        -- -- Should be an AtomicRMW variant eventually.
+        -- -- Sequential consistent.
+        -- TODO: this should be implemented properly!
+        MO_Xchg w           -> mkCCall (xchgLabel w)
+
+  where
+    unsupported :: Show a => a -> b
+    unsupported mop = panic ("outOfLineCmmOp: " ++ show mop
+                          ++ " not supported here")
+    mkCCall :: FastString -> NatM (InstrBlock, Maybe BlockId)
+    mkCCall name = do
+      config <- getConfig
+      target <- cmmMakeDynamicReference config CallReference $
+          mkForeignLabel name Nothing ForeignLabelInThisPackage IsFunction
+      let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn
+      genCCall (ForeignTarget target cconv) dest_regs arg_regs bid
+
+    -- TODO: Optimize using paired stores and loads (STP, LDP). It is
+    -- automomatically done by the allocator for us. However it's not optimal,
+    -- as we'd rather want to have control over
+    --     all spill/load registers, so we can optimize with instructions like
+    --       STP xA, xB, [sp, #-16]!
+    --     and
+    --       LDP xA, xB, sp, #16
+    --
+    passArguments :: Bool -> [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock)
+    passArguments _packStack _ _ [] stackSpace accumRegs accumCode = return (stackSpace, accumRegs, accumCode)
+    -- passArguments _ _ [] accumCode stackSpace | isEven stackSpace = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * stackSpace))
+    -- passArguments _ _ [] accumCode stackSpace = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * (stackSpace + 1)))
+    -- passArguments [] fpRegs (arg0:arg1:args) stack accumCode = do
+    --   -- allocate this on the stack
+    --   (r0, format0, code_r0) <- getSomeReg arg0
+    --   (r1, format1, code_r1) <- getSomeReg arg1
+    --   let w0 = formatToWidth format0
+    --       w1 = formatToWidth format1
+    --       stackCode = unitOL $ STP (OpReg w0 r0) (OpReg w1 R1), (OpAddr (AddrRegImm x31 (ImmInt (stackSpace * 8)))
+    --   passArguments gpRegs (fpReg:fpRegs) args (stackCode `appOL` accumCode)
+
+      -- float promotion.
+      -- According to
+      --  ISO/IEC 9899:2018
+      --  Information technology — Programming languages — C
+      --
+      -- e.g.
+      -- http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf
+      -- http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf
+      --
+      -- GHC would need to know the prototype.
+      --
+      -- > If the expression that denotes the called function has a type that does not include a
+      -- > prototype, the integer promotions are performed on each argument, and arguments that
+      -- > have type float are promoted to double.
+      --
+      -- As we have no way to get prototypes for C yet, we'll *not* promote this
+      -- which is in line with the x86_64 backend :(
+      --
+      -- See the encode_values.cmm test.
+      --
+      -- We would essentially need to insert an FCVT (OpReg W64 fpReg) (OpReg W32 fpReg)
+      -- if w == W32.  But *only* if we don't have a prototype m(
+      --
+      -- For AArch64 specificies see: https://developer.arm.com/docs/ihi0055/latest/procedure-call-standard-for-the-arm-64-bit-architecture
+      --
+    -- Still have GP regs, and we want to pass an GP argument.
+    passArguments pack (gpReg:gpRegs) fpRegs ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do
+      let w = formatToWidth format
+      passArguments pack gpRegs fpRegs args stackSpace (gpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ann (text "Pass gp argument: " <> ppr r) $ MOV (OpReg w gpReg) (OpReg w r)))
+
+    -- Still have FP regs, and we want to pass an FP argument.
+    passArguments pack gpRegs (fpReg:fpRegs) ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do
+      let w = formatToWidth format
+      passArguments pack gpRegs fpRegs args stackSpace (fpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ann (text "Pass fp argument: " <> ppr r) $ MOV (OpReg w fpReg) (OpReg w r)))
+
+    -- No mor regs left to pass. Must pass on stack.
+    passArguments pack [] [] ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode = do
+      let w = formatToWidth format
+          bytes = widthInBits w `div` 8
+          space = if pack then bytes else 8
+          stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace))))
+      passArguments pack [] [] args (stackSpace+space) accumRegs (stackCode `appOL` accumCode)
+
+    -- Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then.
+    passArguments pack [] fpRegs ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isIntFormat format = do
+      let w = formatToWidth format
+          bytes = widthInBits w `div` 8
+          space = if pack then bytes else 8
+          stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace))))
+      passArguments pack [] fpRegs args (stackSpace+space) accumRegs (stackCode `appOL` accumCode)
+
+    -- Still have gpRegs left, but want to pass a FP argument. Must be passed on the stack then.
+    passArguments pack gpRegs [] ((r, format, _hint, code_r):args) stackSpace accumRegs accumCode | isFloatFormat format = do
+      let w = formatToWidth format
+          bytes = widthInBits w `div` 8
+          space = if pack then bytes else 8
+          stackCode = code_r `snocOL` (ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt stackSpace))))
+      passArguments pack gpRegs [] args (stackSpace+space) accumRegs (stackCode `appOL` accumCode)
+
+    passArguments _ _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")
+
+    readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg]-> InstrBlock -> NatM ([Reg], InstrBlock)
+    readResults _ _ [] accumRegs accumCode = return (accumRegs, accumCode)
+    readResults [] _ _ _ _ = do
+      platform <- getPlatform
+      pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target)
+    readResults _ [] _ _ _ = do
+      platform <- getPlatform
+      pprPanic "genCCall, out of fp registers when reading results" (pdoc platform target)
+    readResults (gpReg:gpRegs) (fpReg:fpRegs) (dst:dsts) accumRegs accumCode = do
+      -- gp/fp reg -> dst
+      platform <- getPlatform
+      let rep = cmmRegType platform (CmmLocal dst)
+          format = cmmTypeFormat rep
+          w   = cmmRegWidth platform (CmmLocal dst)
+          r_dst = getRegisterReg platform (CmmLocal dst)
+      if isFloatFormat format
+        then readResults (gpReg:gpRegs) fpRegs dsts (fpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg))
+        else readResults gpRegs (fpReg:fpRegs) dsts (gpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg))
diff --git a/compiler/GHC/CmmToAsm/AArch64/Cond.hs b/compiler/GHC/CmmToAsm/AArch64/Cond.hs
new file mode 100644
index 0000000000..687daccfda
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Cond.hs
@@ -0,0 +1,66 @@
+module GHC.CmmToAsm.AArch64.Cond  where
+
+import GHC.Prelude
+
+-- https://developer.arm.com/documentation/den0024/a/the-a64-instruction-set/data-processing-instructions/conditional-instructions
+
+-- TODO: This appears to go a bit overboard? Maybe we should stick with what LLVM
+-- settled on for fcmp?
+-- false: always yields false, regardless of operands.
+-- oeq: yields true if both operands are not a QNAN and op1 is equal to op2.
+-- ogt: yields true if both operands are not a QNAN and op1 is greater than op2.
+-- oge: yields true if both operands are not a QNAN and op1 is greater than or equal to op2.
+-- olt: yields true if both operands are not a QNAN and op1 is less than op2.
+-- ole: yields true if both operands are not a QNAN and op1 is less than or equal to op2.
+-- one: yields true if both operands are not a QNAN and op1 is not equal to op2.
+-- ord: yields true if both operands are not a QNAN.
+-- ueq: yields true if either operand is a QNAN or op1 is equal to op2.
+-- ugt: yields true if either operand is a QNAN or op1 is greater than op2.
+-- uge: yields true if either operand is a QNAN or op1 is greater than or equal to op2.
+-- ult: yields true if either operand is a QNAN or op1 is less than op2.
+-- ule: yields true if either operand is a QNAN or op1 is less than or equal to op2.
+-- une: yields true if either operand is a QNAN or op1 is not equal to op2.
+-- uno: yields true if either operand is a QNAN.
+-- true: always yields true, regardless of operands.
+--
+-- LLVMs icmp knows about:
+-- eq: yields true if the operands are equal, false otherwise. No sign interpretation is necessary or performed.
+-- ne: yields true if the operands are unequal, false otherwise. No sign interpretation is necessary or performed.
+-- ugt: interprets the operands as unsigned values and yields true if op1 is greater than op2.
+-- uge: interprets the operands as unsigned values and yields true if op1 is greater than or equal to op2.
+-- ult: interprets the operands as unsigned values and yields true if op1 is less than op2.
+-- ule: interprets the operands as unsigned values and yields true if op1 is less than or equal to op2.
+-- sgt: interprets the operands as signed values and yields true if op1 is greater than op2.
+-- sge: interprets the operands as signed values and yields true if op1 is greater than or equal to op2.
+-- slt: interprets the operands as signed values and yields true if op1 is less than op2.
+-- sle: interprets the operands as signed values and yields true if op1 is less than or equal to op2.
+
+data Cond
+    = ALWAYS -- b.al
+    | EQ     -- b.eq
+    | NE     -- b.ne
+    -- signed
+    | SLT    -- b.lt
+    | SLE    -- b.le
+    | SGE    -- b.ge
+    | SGT    -- b.gt
+    -- unsigned
+    | ULT    -- b.lo
+    | ULE    -- b.ls
+    | UGE    -- b.hs
+    | UGT    -- b.hi
+    -- ordered
+    | OLT    -- b.mi
+    | OLE    -- b.ls
+    | OGE    -- b.ge
+    | OGT    -- b.gt
+    -- unordered
+    | UOLT   -- b.lt
+    | UOLE   -- b.le
+    | UOGE   -- b.pl
+    | UOGT   -- b.hi
+    -- others
+    | NEVER  -- b.nv
+    | VS     -- oVerflow set
+    | VC     -- oVerflow clear
+    deriving Eq
diff --git a/compiler/GHC/CmmToAsm/AArch64/Instr.hs b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
new file mode 100644
index 0000000000..7d4eaa95f6
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
@@ -0,0 +1,758 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+module GHC.CmmToAsm.AArch64.Instr
+
+where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Cond
+import GHC.CmmToAsm.AArch64.Regs
+
+import GHC.CmmToAsm.Instr (RegUsage(..))
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Types
+import GHC.CmmToAsm.Utils
+import GHC.CmmToAsm.Config
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm
+import GHC.Cmm.CLabel
+import GHC.Utils.Outputable
+import GHC.Platform
+import GHC.Types.Unique.Supply
+
+import GHC.Utils.Panic
+
+import Control.Monad (replicateM)
+import Data.Maybe (fromMaybe)
+
+import GHC.Stack
+
+-- | TODO: verify this!
+stackFrameHeaderSize :: Platform -> Int
+stackFrameHeaderSize _ = 64
+
+-- | All registers are 8 byte wide.
+spillSlotSize :: Int
+spillSlotSize = 8
+
+-- | The number of bytes that the stack pointer should be aligned
+-- to.
+stackAlign :: Int
+stackAlign = 16
+
+-- | The number of spill slots available without allocating more.
+maxSpillSlots :: NCGConfig -> Int
+maxSpillSlots config
+--  = 0 -- set to zero, to see when allocMoreStack has to fire.
+    = let platform = ncgPlatform config
+      in ((ncgSpillPreallocSize config - stackFrameHeaderSize platform)
+         `div` spillSlotSize) - 1
+
+-- | Convert a spill slot number to a *byte* offset, with no sign.
+spillSlotToOffset :: NCGConfig -> Int -> Int
+spillSlotToOffset config slot
+   = stackFrameHeaderSize (ncgPlatform config) + spillSlotSize * slot
+
+-- | Get the registers that are being used by this instruction.
+-- regUsage doesn't need to do any trickery for jumps and such.
+-- Just state precisely the regs read and written by that insn.
+-- The consequences of control flow transfers, as far as register
+-- allocation goes, are taken care of by the register allocator.
+--
+-- RegUsage = RU [<read regs>] [<write regs>]
+
+instance Outputable RegUsage where
+    ppr (RU reads writes) = text "RegUsage(reads:" <+> ppr reads <> comma <+> text "writes:" <+> ppr writes <> char ')'
+
+regUsageOfInstr :: Platform -> Instr -> RegUsage
+regUsageOfInstr platform instr = case instr of
+  ANN _ i                  -> regUsageOfInstr platform i
+  -- 1. Arithmetic Instructions ------------------------------------------------
+  ADD dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  CMN l r                  -> usage (regOp l ++ regOp r, [])
+  CMP l r                  -> usage (regOp l ++ regOp r, [])
+  MSUB dst src1 src2 src3  -> usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
+  MUL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  NEG dst src              -> usage (regOp src, regOp dst)
+  SDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  SUB dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  UDIV dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+
+  -- 2. Bit Manipulation Instructions ------------------------------------------
+  SBFM dst src _ _         -> usage (regOp src, regOp dst)
+  UBFM dst src _ _         -> usage (regOp src, regOp dst)
+
+  -- 3. Logical and Move Instructions ------------------------------------------
+  AND dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  ASR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  BIC dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  BICS dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  EON dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  EOR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  LSL dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  LSR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  MOV dst src              -> usage (regOp src, regOp dst)
+  MOVK dst src             -> usage (regOp src, regOp dst)
+  MVN dst src              -> usage (regOp src, regOp dst)
+  ORR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  ROR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
+  TST src1 src2            -> usage (regOp src1 ++ regOp src2, [])
+  -- 4. Branch Instructions ----------------------------------------------------
+  J t                      -> usage (regTarget t, [])
+  B t                      -> usage (regTarget t, [])
+  BCOND _ t                -> usage (regTarget t, [])
+  BL t ps _rs              -> usage (regTarget t ++ ps, callerSavedRegisters)
+
+  -- 5. Atomic Instructions ----------------------------------------------------
+  -- 6. Conditional Instructions -----------------------------------------------
+  CSET dst _               -> usage ([], regOp dst)
+  CBZ src _                -> usage (regOp src, [])
+  CBNZ src _               -> usage (regOp src, [])
+  -- 7. Load and Store Instructions --------------------------------------------
+  STR _ src dst            -> usage (regOp src ++ regOp dst, [])
+  LDR _ dst src            -> usage (regOp src, regOp dst)
+  -- TODO is this right? see STR, which I'm only partial about being right?
+  STP _ src1 src2 dst      -> usage (regOp src1 ++ regOp src2 ++ regOp dst, [])
+  LDP _ dst1 dst2 src      -> usage (regOp src, regOp dst1 ++ regOp dst2)
+
+  -- 8. Synchronization Instructions -------------------------------------------
+  DMBSY                    -> usage ([], [])
+
+  -- 9. Floating Point Instructions --------------------------------------------
+  FCVT dst src             -> usage (regOp src, regOp dst)
+  SCVTF dst src            -> usage (regOp src, regOp dst)
+  FCVTZS dst src           -> usage (regOp src, regOp dst)
+
+  _ -> panic "regUsageOfInstr"
+
+  where
+        -- filtering the usage is necessary, otherwise the register
+        -- allocator will try to allocate pre-defined fixed stg
+        -- registers as well, as they show up.
+        usage (src, dst) = RU (filter (interesting platform) src)
+                              (filter (interesting platform) dst)
+
+        regAddr :: AddrMode -> [Reg]
+        regAddr (AddrRegReg r1 r2) = [r1, r2]
+        regAddr (AddrRegImm r1 _)  = [r1]
+        regAddr (AddrReg r1)       = [r1]
+        regOp :: Operand -> [Reg]
+        regOp (OpReg _ r1) = [r1]
+        regOp (OpRegExt _ r1 _ _) = [r1]
+        regOp (OpRegShift _ r1 _ _) = [r1]
+        regOp (OpAddr a) = regAddr a
+        regOp (OpImm _) = []
+        regOp (OpImmShift _ _ _) = []
+        regTarget :: Target -> [Reg]
+        regTarget (TBlock _) = []
+        regTarget (TLabel _) = []
+        regTarget (TReg r1)  = [r1]
+
+        -- Is this register interesting for the register allocator?
+        interesting :: Platform -> Reg -> Bool
+        interesting _        (RegVirtual _)                 = True
+        interesting _        (RegReal (RealRegSingle (-1))) = False
+        interesting platform (RegReal (RealRegSingle i))    = freeReg platform i
+        interesting _        (RegReal (RealRegPair{}))
+            = panic "AArch64.Instr.interesting: no reg pairs on this arch"
+
+-- Save caller save registers
+-- This is x0-x18
+--
+-- For SIMD/FP Registers:
+-- Registers v8-v15 must be preserved by a callee across subroutine calls;
+-- the remaining registers (v0-v7, v16-v31) do not need to be preserved (or
+-- should be preserved by the caller). Additionally, only the bottom 64 bits
+-- of each value stored in v8-v15 need to be preserved [7]; it is the
+-- responsibility of the caller to preserve larger values.
+--
+-- .---------------------------------------------------------------------------------------------------------------------------------------------------------------.
+-- |  0 |  1 |  2 |  3 |  4 |  5 |  6 |  7 |  8 |  9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
+-- | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
+-- |== General Purpose registers ==================================================================================================================================|
+-- | <---- argument passing -------------> | IR | <------- tmp registers --------> | IP0| IP1| PL | <------------------- callee saved ------------> | FP | LR | SP |
+-- | <------ free registers --------------------------------------------------------------------> | BR | Sp | Hp | R1 | R2 | R3 | R4 | R5 | R6 | SL | -- | -- | -- |
+-- |== SIMD/FP Registers ==========================================================================================================================================|
+-- | <---- argument passing -------------> | <-- callee saved (lower 64 bits) ---> | <--------------------------------------- caller saved ----------------------> |
+-- | <------ free registers -------------> | F1 | F2 | F3 | F4 | D1 | D2 | D3 | D4 | <------ free registers -----------------------------------------------------> |
+-- '---------------------------------------------------------------------------------------------------------------------------------------------------------------'
+-- IR: Indirect result location register, IP: Intra-procedure register, PL: Platform register, FP: Frame pointer, LR: Link register, SP: Stack pointer
+-- BR: Base, SL: SpLim
+callerSavedRegisters :: [Reg]
+callerSavedRegisters
+    = map regSingle [0..18]
+    ++ map regSingle [32..39]
+    ++ map regSingle [48..63]
+
+-- | Apply a given mapping to all the register references in this
+-- instruction.
+patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+patchRegsOfInstr instr env = case instr of
+    -- 0. Meta Instructions
+    ANN d i        -> ANN d (patchRegsOfInstr i env)
+    -- 1. Arithmetic Instructions ----------------------------------------------
+    ADD o1 o2 o3   -> ADD (patchOp o1) (patchOp o2) (patchOp o3)
+    CMN o1 o2      -> CMN (patchOp o1) (patchOp o2)
+    CMP o1 o2      -> CMP (patchOp o1) (patchOp o2)
+    MSUB o1 o2 o3 o4 -> MSUB (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+    MUL o1 o2 o3   -> MUL (patchOp o1) (patchOp o2) (patchOp o3)
+    NEG o1 o2      -> NEG (patchOp o1) (patchOp o2)
+    SDIV o1 o2 o3  -> SDIV (patchOp o1) (patchOp o2) (patchOp o3)
+    SUB o1 o2 o3   -> SUB  (patchOp o1) (patchOp o2) (patchOp o3)
+    UDIV o1 o2 o3  -> UDIV (patchOp o1) (patchOp o2) (patchOp o3)
+
+    -- 2. Bit Manipulation Instructions ----------------------------------------
+    SBFM o1 o2 o3 o4 -> SBFM (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+    UBFM o1 o2 o3 o4 -> UBFM (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+
+    -- 3. Logical and Move Instructions ----------------------------------------
+    AND o1 o2 o3   -> AND  (patchOp o1) (patchOp o2) (patchOp o3)
+    ANDS o1 o2 o3  -> ANDS (patchOp o1) (patchOp o2) (patchOp o3)
+    ASR o1 o2 o3   -> ASR  (patchOp o1) (patchOp o2) (patchOp o3)
+    BIC o1 o2 o3   -> BIC  (patchOp o1) (patchOp o2) (patchOp o3)
+    BICS o1 o2 o3  -> BICS (patchOp o1) (patchOp o2) (patchOp o3)
+    EON o1 o2 o3   -> EON  (patchOp o1) (patchOp o2) (patchOp o3)
+    EOR o1 o2 o3   -> EOR  (patchOp o1) (patchOp o2) (patchOp o3)
+    LSL o1 o2 o3   -> LSL  (patchOp o1) (patchOp o2) (patchOp o3)
+    LSR o1 o2 o3   -> LSR  (patchOp o1) (patchOp o2) (patchOp o3)
+    MOV o1 o2      -> MOV  (patchOp o1) (patchOp o2)
+    MOVK o1 o2     -> MOVK (patchOp o1) (patchOp o2)
+    MVN o1 o2      -> MVN  (patchOp o1) (patchOp o2)
+    ORR o1 o2 o3   -> ORR  (patchOp o1) (patchOp o2) (patchOp o3)
+    ROR o1 o2 o3   -> ROR  (patchOp o1) (patchOp o2) (patchOp o3)
+    TST o1 o2      -> TST  (patchOp o1) (patchOp o2)
+
+    -- 4. Branch Instructions --------------------------------------------------
+    J t            -> J (patchTarget t)
+    B t            -> B (patchTarget t)
+    BL t rs ts     -> BL (patchTarget t) rs ts
+    BCOND c t      -> BCOND c (patchTarget t)
+
+    -- 5. Atomic Instructions --------------------------------------------------
+    -- 6. Conditional Instructions ---------------------------------------------
+    CSET o c       -> CSET (patchOp o) c
+    CBZ o l        -> CBZ (patchOp o) l
+    CBNZ o l       -> CBNZ (patchOp o) l
+    -- 7. Load and Store Instructions ------------------------------------------
+    STR f o1 o2    -> STR f (patchOp o1) (patchOp o2)
+    LDR f o1 o2    -> LDR f (patchOp o1) (patchOp o2)
+    STP f o1 o2 o3 -> STP f (patchOp o1) (patchOp o2) (patchOp o3)
+    LDP f o1 o2 o3 -> LDP f (patchOp o1) (patchOp o2) (patchOp o3)
+
+    -- 8. Synchronization Instructions -----------------------------------------
+    DMBSY          -> DMBSY
+
+    -- 9. Floating Point Instructions ------------------------------------------
+    FCVT o1 o2     -> FCVT (patchOp o1) (patchOp o2)
+    SCVTF o1 o2    -> SCVTF (patchOp o1) (patchOp o2)
+    FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
+
+    _ -> pprPanic "patchRegsOfInstr" (text $ show instr)
+    where
+        patchOp :: Operand -> Operand
+        patchOp (OpReg w r) = OpReg w (env r)
+        patchOp (OpRegExt w r x s) = OpRegExt w (env r) x s
+        patchOp (OpRegShift w r m s) = OpRegShift w (env r) m s
+        patchOp (OpAddr a) = OpAddr (patchAddr a)
+        patchOp op = op
+        patchTarget :: Target -> Target
+        patchTarget (TReg r) = TReg (env r)
+        patchTarget t = t
+        patchAddr :: AddrMode -> AddrMode
+        patchAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2)
+        patchAddr (AddrRegImm r1 i)  = AddrRegImm (env r1) i
+        patchAddr (AddrReg r) = AddrReg (env r)
+--------------------------------------------------------------------------------
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+isJumpishInstr :: Instr -> Bool
+isJumpishInstr instr = case instr of
+    ANN _ i -> isJumpishInstr i
+    CBZ{} -> True
+    CBNZ{} -> True
+    J{} -> True
+    B{} -> True
+    BL{} -> True
+    BCOND{} -> True
+    _ -> False
+
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+jumpDestsOfInstr :: Instr -> [BlockId]
+jumpDestsOfInstr (ANN _ i) = jumpDestsOfInstr i
+jumpDestsOfInstr (CBZ _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (CBNZ _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (J t) = [id | TBlock id <- [t]]
+jumpDestsOfInstr (B t) = [id | TBlock id <- [t]]
+jumpDestsOfInstr (BL t _ _) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr (BCOND _ t) = [ id | TBlock id <- [t]]
+jumpDestsOfInstr _ = []
+
+-- | Change the destination of this jump instruction.
+-- Used in the linear allocator when adding fixup blocks for join
+-- points.
+patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
+patchJumpInstr instr patchF
+    = case instr of
+        ANN d i -> ANN d (patchJumpInstr i patchF)
+        CBZ r (TBlock bid) -> CBZ r (TBlock (patchF bid))
+        CBNZ r (TBlock bid) -> CBNZ r (TBlock (patchF bid))
+        J (TBlock bid) -> J (TBlock (patchF bid))
+        B (TBlock bid) -> B (TBlock (patchF bid))
+        BL (TBlock bid) ps rs -> BL (TBlock (patchF bid)) ps rs
+        BCOND c (TBlock bid) -> BCOND c (TBlock (patchF bid))
+        _ -> pprPanic "patchJumpInstr" (text $ show instr)
+
+-- -----------------------------------------------------------------------------
+-- Note [Spills and Reloads]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~
+-- We reserve @RESERVED_C_STACK_BYTES@ on the C stack for spilling and reloading
+-- registers.  AArch64s maximum displacement for SP relative spills and reloads
+-- is essentially [-256,255], or [0, 0xFFF]*8 = [0, 32760] for 64bits.
+--
+-- The @RESERVED_C_STACK_BYTES@ is 16k, so we can't address any location in a
+-- single instruction.  The idea is to use the Inter Procedure 0 (ip0) register
+-- to perform the computations for larger offsets.
+--
+-- Using sp to compute the offset will violate assumptions about the stack pointer
+-- pointing to the top of the stack during signal handling.  As we can't force
+-- every signal to use its own stack, we have to ensure that the stack poitner
+-- always poitns to the top of the stack, and we can't use it for computation.
+--
+-- | An instruction to spill a register into a spill slot.
+mkSpillInstr
+   :: HasCallStack
+   => NCGConfig
+   -> Reg       -- register to spill
+   -> Int       -- current stack delta
+   -> Int       -- spill slot to use
+   -> [Instr]
+
+mkSpillInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkStrSp imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkStrSp imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkIp0SpillAddr (imm .&~. 0xfff)
+                                                                     , mkStrIp0 (imm .&.  0xfff)
+                                                                     ]
+    imm -> pprPanic "mkSpillInstr" (text "Unable to spill into" <+> int imm)
+    where
+        a .&~. b = a .&. (complement b)
+
+        fmt = case reg of
+            RegReal (RealRegSingle n) | n < 32 -> II64
+            _                                  -> FF64
+        mkIp0SpillAddr imm = ANN (text "Spill: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
+        mkStrSp imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkStrIp0 imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
+
+        off = spillSlotToOffset config slot
+
+mkLoadInstr
+   :: NCGConfig
+   -> Reg       -- register to load
+   -> Int       -- current stack delta
+   -> Int       -- spill slot to use
+   -> [Instr]
+
+mkLoadInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkLdrSp imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkLdrSp imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkIp0SpillAddr (imm .&~. 0xfff)
+                                                                     , mkLdrIp0 (imm .&.  0xfff)
+                                                                     ]
+    imm -> pprPanic "mkSpillInstr" (text "Unable to spill into" <+> int imm)
+    where
+        a .&~. b = a .&. (complement b)
+
+        fmt = case reg of
+            RegReal (RealRegSingle n) | n < 32 -> II64
+            _                                  -> FF64
+
+        mkIp0SpillAddr imm = ANN (text "Reload: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
+        mkLdrSp imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkLdrIp0 imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
+
+        off = spillSlotToOffset config slot
+
+--------------------------------------------------------------------------------
+-- | See if this instruction is telling us the current C stack delta
+takeDeltaInstr :: Instr -> Maybe Int
+takeDeltaInstr (ANN _ i) = takeDeltaInstr i
+takeDeltaInstr (DELTA i) = Just i
+takeDeltaInstr _         = Nothing
+
+-- Not real instructions.  Just meta data
+isMetaInstr :: Instr -> Bool
+isMetaInstr instr
+ = case instr of
+    ANN _ i     -> isMetaInstr i
+    COMMENT{}   -> True
+    MULTILINE_COMMENT{} -> True
+    LOCATION{}  -> True
+    LDATA{}     -> True
+    NEWBLOCK{}  -> True
+    DELTA{}     -> True
+    PUSH_STACK_FRAME -> True
+    POP_STACK_FRAME -> True
+    _           -> False
+
+-- | Copy the value in a register to another one.
+-- Must work for all register classes.
+mkRegRegMoveInstr :: Reg -> Reg -> Instr
+mkRegRegMoveInstr src dst = ANN (text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
+
+-- | Take the source and destination from this reg -> reg move instruction
+-- or Nothing if it's not one
+takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
+--takeRegRegMoveInstr (MOV (OpReg fmt dst) (OpReg fmt' src)) | fmt == fmt' = Just (src, dst)
+takeRegRegMoveInstr _ = Nothing
+
+-- | Make an unconditional jump instruction.
+mkJumpInstr :: BlockId -> [Instr]
+mkJumpInstr id = [B (TBlock id)]
+
+mkStackAllocInstr :: Platform -> Int -> [Instr]
+mkStackAllocInstr platform n
+    | n == 0 = []
+    | n > 0 && n < 4096 = [ ANN (text "Alloc More Stack") $ SUB sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Alloc More Stack") (SUB sp sp (OpImm (ImmInt 4095))) : mkStackAllocInstr platform (n - 4095)
+mkStackAllocInstr _platform n = pprPanic "mkStackAllocInstr" (int n)
+
+mkStackDeallocInstr :: Platform -> Int -> [Instr]
+mkStackDeallocInstr platform n
+    | n == 0 = []
+    | n > 0 && n < 4096 = [ ANN (text "Dealloc More Stack") $ ADD sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Dealloc More Stack") (ADD sp sp (OpImm (ImmInt 4095))) : mkStackDeallocInstr platform (n - 4095)
+mkStackDeallocInstr _platform n = pprPanic "mkStackDeallocInstr" (int n)
+
+--
+-- See note [extra spill slots] in X86/Instr.hs
+--
+allocMoreStack
+  :: Platform
+  -> Int
+  -> NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr
+  -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.AArch64.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
+    let entries = entryBlocks proc
+
+    uniqs <- replicateM (length entries) getUniqueM
+
+    let
+      delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+        where x = slots * spillSlotSize -- sp delta
+
+      alloc   = mkStackAllocInstr   platform delta
+      dealloc = mkStackDeallocInstr platform delta
+
+      retargetList = (zip entries (map mkBlockId uniqs))
+
+      new_blockmap :: LabelMap BlockId
+      new_blockmap = mapFromList retargetList
+
+      insert_stack_insn (BasicBlock id insns)
+        | Just new_blockid <- mapLookup id new_blockmap
+        = [ BasicBlock id $ alloc ++ [ B (TBlock new_blockid) ]
+          , BasicBlock new_blockid block' ]
+        | otherwise
+        = [ BasicBlock id block' ]
+        where
+          block' = foldr insert_dealloc [] insns
+
+      insert_dealloc insn r = case insn of
+        J _ -> dealloc ++ (insn : r)
+        ANN _ (J _) -> dealloc ++ (insn : r)
+        _other | jumpDestsOfInstr insn /= []
+            -> patchJumpInstr insn retarget : r
+        _other -> insn : r
+
+        where retarget b = fromMaybe b (mapLookup b new_blockmap)
+
+      new_code = concatMap insert_stack_insn code
+    -- in
+    return (CmmProc info lbl live (ListGraph new_code), retargetList)
+-- -----------------------------------------------------------------------------
+-- Machine's assembly language
+
+-- We have a few common "instructions" (nearly all the pseudo-ops) but
+-- mostly all of 'Instr' is machine-specific.
+
+-- Some additional (potential future) instructions are commented out. They are
+-- not needed yet for the backend but could be used in the future.
+data Instr
+    -- comment pseudo-op
+    = COMMENT SDoc
+    | MULTILINE_COMMENT SDoc
+
+    -- Annotated instruction. Should print <instr> # <doc>
+    | ANN SDoc Instr
+
+    -- location pseudo-op (file, line, col, name)
+    | LOCATION Int Int Int String
+
+    -- some static data spat out during code
+    -- generation.  Will be extracted before
+    -- pretty-printing.
+    | LDATA   Section RawCmmStatics
+
+    -- start a new basic block.  Useful during
+    -- codegen, removed later.  Preceding
+    -- instruction should be a jump, as per the
+    -- invariants for a BasicBlock (see Cmm).
+    | NEWBLOCK BlockId
+
+    -- specify current stack offset for
+    -- benefit of subsequent passes
+    | DELTA   Int
+
+    -- 0. Pseudo Instructions --------------------------------------------------
+    -- These are instructions not contained or only partially contained in the
+    -- official ISA, but make reading clearer. Some of them might even be
+    -- implemented in the assembler, but are not guaranteed to be portable.
+    -- | SXTB Operand Operand
+    -- | SXTH Operand Operand
+    -- | SXTW Operand Operand
+    -- | SXTX Operand Operand
+    | PUSH_STACK_FRAME
+    | POP_STACK_FRAME
+    -- 1. Arithmetic Instructions ----------------------------------------------
+    -- | ADC Operand Operand Operang -- rd = rn + rm + C
+    -- | ADCS ...
+    | ADD Operand Operand Operand -- rd = rn + rm
+    -- | ADDS Operand Operand Operand -- rd = rn + rm
+    -- | ADR ...
+    -- | ADRP ...
+    | CMN Operand Operand -- rd + op2
+    | CMP Operand Operand -- rd - op2
+    -- | MADD ...
+    -- | MNEG ...
+    | MSUB Operand Operand Operand Operand -- rd = ra - rn × rm
+    | MUL Operand Operand Operand -- rd = rn × rm
+    | NEG Operand Operand -- rd = -op2
+    -- | NEGS ...
+    -- | NGC ...
+    -- | NGCS ...
+    -- | SBC ...
+    -- | SBCS ...
+    | SDIV Operand Operand Operand -- rd = rn ÷ rm
+    -- | SMADDL ...
+    -- | SMNEGL ...
+    -- | SMSUBL ...
+    -- | SMULH ...
+    -- | SMULL ...
+    | SUB Operand Operand Operand -- rd = rn - op2
+    -- | SUBS ...
+    | UDIV Operand Operand Operand -- rd = rn ÷ rm
+    -- | UMADDL ...  -- Xd = Xa + Wn × Wm
+    -- | UMNEGL ... -- Xd = - Wn × Wm
+    -- | UMSUBL ... -- Xd = Xa - Wn × Wm
+    -- | UMULH ... -- Xd = (Xn × Xm)_127:64
+    -- | UMULL ... -- Xd = Wn × Wm
+
+    -- 2. Bit Manipulation Instructions ----------------------------------------
+    | SBFM Operand Operand Operand Operand -- rd = rn[i,j]
+    -- SXTB = SBFM <Wd>, <Wn>, #0, #7
+    -- SXTH = SBFM <Wd>, <Wn>, #0, #15
+    -- SXTW = SBFM <Wd>, <Wn>, #0, #31
+    | UBFM Operand Operand Operand Operand -- rd = rn[i,j]
+    -- UXTB = UBFM <Wd>, <Wn>, #0, #7
+    -- UXTH = UBFM <Wd>, <Wn>, #0, #15
+
+    -- 3. Logical and Move Instructions ----------------------------------------
+    | AND Operand Operand Operand -- rd = rn & op2
+    | ANDS Operand Operand Operand -- rd = rn & op2
+    | ASR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
+    | BIC Operand Operand Operand -- rd = rn & ~op2
+    | BICS Operand Operand Operand -- rd = rn & ~op2
+    | EON Operand Operand Operand -- rd = rn ⊕ ~op2
+    | EOR Operand Operand Operand -- rd = rn ⊕ op2
+    | LSL Operand Operand Operand -- rd = rn ≪ rm  or rd = rn ≪ #i, i is 6 bits
+    | LSR Operand Operand Operand -- rd = rn ≫ rm  or rd = rn ≫ #i, i is 6 bits
+    | MOV Operand Operand -- rd = rn  or  rd = #i
+    | MOVK Operand Operand
+    -- | MOVN Operand Operand
+    -- | MOVZ Operand Operand
+    | MVN Operand Operand -- rd = ~rn
+    | ORN Operand Operand Operand -- rd = rn | ~op2
+    | ORR Operand Operand Operand -- rd = rn | op2
+    | ROR Operand Operand Operand -- rd = rn ≫ rm  or  rd = rn ≫ #i, i is 6 bits
+    | TST Operand Operand -- rn & op2
+    -- Load and stores.
+    -- TODO STR/LDR might want to change to STP/LDP with XZR for the second register.
+    | STR Format Operand Operand -- str Xn, address-mode // Xn -> *addr
+    | LDR Format Operand Operand -- ldr Xn, address-mode // Xn <- *addr
+    | STP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn -> *addr, Xm -> *(addr + 8)
+    | LDP Format Operand Operand Operand -- stp Xn, Xm, address-mode // Xn <- *addr, Xm <- *(addr + 8)
+
+    -- Conditional instructions
+    | CSET Operand Cond   -- if(cond) op <- 1 else op <- 0
+
+    | CBZ Operand Target  -- if op == 0, then branch.
+    | CBNZ Operand Target -- if op /= 0, then branch.
+    -- Branching.
+    | J Target            -- like B, but only generated from genJump. Used to distinguish genJumps from others.
+    | B Target            -- unconditional branching b/br. (To a blockid, label or register)
+    | BL Target [Reg] [Reg] -- branch and link (e.g. set x30 to next pc, and branch)
+    | BCOND Cond Target   -- branch with condition. b.<cond>
+
+    -- 8. Synchronization Instructions -----------------------------------------
+    | DMBSY
+    -- 9. Floating Point Instructions
+    -- Float ConVerT
+    | FCVT Operand Operand
+    -- Signed ConVerT Float
+    | SCVTF Operand Operand
+    -- Float ConVerT to Zero Signed
+    | FCVTZS Operand Operand
+
+instance Show Instr where
+    show (LDR _f o1 o2) = "LDR " ++ show o1 ++ ", " ++ show o2
+    show (MOV o1 o2) = "MOV " ++ show o1 ++ ", " ++ show o2
+    show _ = "missing"
+
+data Target
+    = TBlock BlockId
+    | TLabel CLabel
+    | TReg   Reg
+
+
+-- Extension
+-- {Unsigned|Signed}XT{Byte|Half|Word|Doube}
+data ExtMode
+    = EUXTB | EUXTH | EUXTW | EUXTX
+    | ESXTB | ESXTH | ESXTW | ESXTX
+    deriving (Eq, Show)
+
+data ShiftMode
+    = SLSL | SLSR | SASR | SROR
+    deriving (Eq, Show)
+
+
+-- We can also add ExtShift to Extension.
+-- However at most 3bits.
+type ExtShift = Int
+-- at most 6bits
+type RegShift = Int
+
+data Operand
+        = OpReg Width Reg            -- register
+        | OpRegExt Width Reg ExtMode ExtShift -- rm, <ext>[, <shift left>]
+        | OpRegShift Width Reg ShiftMode RegShift     -- rm, <shift>, <0-64>
+        | OpImm Imm            -- immediate value
+        | OpImmShift Imm ShiftMode RegShift
+        | OpAddr AddrMode       -- memory reference
+        deriving (Eq, Show)
+
+-- Smart constructors
+opReg :: Width -> Reg -> Operand
+opReg = OpReg
+
+xzr, wzr, sp, ip0 :: Operand
+xzr = OpReg W64 (RegReal (RealRegSingle (-1)))
+wzr = OpReg W32 (RegReal (RealRegSingle (-1)))
+sp  = OpReg W64 (RegReal (RealRegSingle 31))
+ip0 = OpReg W64 (RegReal (RealRegSingle 16))
+
+_x :: Int -> Operand
+_x i = OpReg W64 (RegReal (RealRegSingle i))
+x0,  x1,  x2,  x3,  x4,  x5,  x6,  x7  :: Operand
+x8,  x9,  x10, x11, x12, x13, x14, x15 :: Operand
+x16, x17, x18, x19, x20, x21, x22, x23 :: Operand
+x24, x25, x26, x27, x28, x29, x30, x31 :: Operand
+x0  = OpReg W64 (RegReal (RealRegSingle  0))
+x1  = OpReg W64 (RegReal (RealRegSingle  1))
+x2  = OpReg W64 (RegReal (RealRegSingle  2))
+x3  = OpReg W64 (RegReal (RealRegSingle  3))
+x4  = OpReg W64 (RegReal (RealRegSingle  4))
+x5  = OpReg W64 (RegReal (RealRegSingle  5))
+x6  = OpReg W64 (RegReal (RealRegSingle  6))
+x7  = OpReg W64 (RegReal (RealRegSingle  7))
+x8  = OpReg W64 (RegReal (RealRegSingle  8))
+x9  = OpReg W64 (RegReal (RealRegSingle  9))
+x10 = OpReg W64 (RegReal (RealRegSingle 10))
+x11 = OpReg W64 (RegReal (RealRegSingle 11))
+x12 = OpReg W64 (RegReal (RealRegSingle 12))
+x13 = OpReg W64 (RegReal (RealRegSingle 13))
+x14 = OpReg W64 (RegReal (RealRegSingle 14))
+x15 = OpReg W64 (RegReal (RealRegSingle 15))
+x16 = OpReg W64 (RegReal (RealRegSingle 16))
+x17 = OpReg W64 (RegReal (RealRegSingle 17))
+x18 = OpReg W64 (RegReal (RealRegSingle 18))
+x19 = OpReg W64 (RegReal (RealRegSingle 19))
+x20 = OpReg W64 (RegReal (RealRegSingle 20))
+x21 = OpReg W64 (RegReal (RealRegSingle 21))
+x22 = OpReg W64 (RegReal (RealRegSingle 22))
+x23 = OpReg W64 (RegReal (RealRegSingle 23))
+x24 = OpReg W64 (RegReal (RealRegSingle 24))
+x25 = OpReg W64 (RegReal (RealRegSingle 25))
+x26 = OpReg W64 (RegReal (RealRegSingle 26))
+x27 = OpReg W64 (RegReal (RealRegSingle 27))
+x28 = OpReg W64 (RegReal (RealRegSingle 28))
+x29 = OpReg W64 (RegReal (RealRegSingle 29))
+x30 = OpReg W64 (RegReal (RealRegSingle 30))
+x31 = OpReg W64 (RegReal (RealRegSingle 31))
+
+_d :: Int -> Operand
+_d = OpReg W64 . RegReal . RealRegSingle
+d0,  d1,  d2,  d3,  d4,  d5,  d6,  d7  :: Operand
+d8,  d9,  d10, d11, d12, d13, d14, d15 :: Operand
+d16, d17, d18, d19, d20, d21, d22, d23 :: Operand
+d24, d25, d26, d27, d28, d29, d30, d31 :: Operand
+d0  = OpReg W64 (RegReal (RealRegSingle 32))
+d1  = OpReg W64 (RegReal (RealRegSingle 33))
+d2  = OpReg W64 (RegReal (RealRegSingle 34))
+d3  = OpReg W64 (RegReal (RealRegSingle 35))
+d4  = OpReg W64 (RegReal (RealRegSingle 36))
+d5  = OpReg W64 (RegReal (RealRegSingle 37))
+d6  = OpReg W64 (RegReal (RealRegSingle 38))
+d7  = OpReg W64 (RegReal (RealRegSingle 39))
+d8  = OpReg W64 (RegReal (RealRegSingle 40))
+d9  = OpReg W64 (RegReal (RealRegSingle 41))
+d10 = OpReg W64 (RegReal (RealRegSingle 42))
+d11 = OpReg W64 (RegReal (RealRegSingle 43))
+d12 = OpReg W64 (RegReal (RealRegSingle 44))
+d13 = OpReg W64 (RegReal (RealRegSingle 45))
+d14 = OpReg W64 (RegReal (RealRegSingle 46))
+d15 = OpReg W64 (RegReal (RealRegSingle 47))
+d16 = OpReg W64 (RegReal (RealRegSingle 48))
+d17 = OpReg W64 (RegReal (RealRegSingle 49))
+d18 = OpReg W64 (RegReal (RealRegSingle 50))
+d19 = OpReg W64 (RegReal (RealRegSingle 51))
+d20 = OpReg W64 (RegReal (RealRegSingle 52))
+d21 = OpReg W64 (RegReal (RealRegSingle 53))
+d22 = OpReg W64 (RegReal (RealRegSingle 54))
+d23 = OpReg W64 (RegReal (RealRegSingle 55))
+d24 = OpReg W64 (RegReal (RealRegSingle 56))
+d25 = OpReg W64 (RegReal (RealRegSingle 57))
+d26 = OpReg W64 (RegReal (RealRegSingle 58))
+d27 = OpReg W64 (RegReal (RealRegSingle 59))
+d28 = OpReg W64 (RegReal (RealRegSingle 60))
+d29 = OpReg W64 (RegReal (RealRegSingle 61))
+d30 = OpReg W64 (RegReal (RealRegSingle 62))
+d31 = OpReg W64 (RegReal (RealRegSingle 63))
+
+opRegUExt :: Width -> Reg -> Operand
+opRegUExt W64 r = OpRegExt W64 r EUXTX 0
+opRegUExt W32 r = OpRegExt W32 r EUXTW 0
+opRegUExt W16 r = OpRegExt W16 r EUXTH 0
+opRegUExt W8  r = OpRegExt W8  r EUXTB 0
+opRegUExt w  _r = pprPanic "opRegUExt" (text $ show w)
+
+opRegSExt :: Width -> Reg -> Operand
+opRegSExt W64 r = OpRegExt W64 r ESXTX 0
+opRegSExt W32 r = OpRegExt W32 r ESXTW 0
+opRegSExt W16 r = OpRegExt W16 r ESXTH 0
+opRegSExt W8  r = OpRegExt W8  r ESXTB 0
+opRegSExt w  _r = pprPanic "opRegSExt" (text $ show w)
diff --git a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
new file mode 100644
index 0000000000..3f413339c2
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
@@ -0,0 +1,587 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.AArch64.Ppr (pprNatCmmDecl, pprInstr) where
+
+import GHC.Prelude hiding (EQ)
+
+import Data.Word
+import qualified Data.Array.Unsafe as U ( castSTUArray )
+import Data.Array.ST
+import Control.Monad.ST
+
+import GHC.CmmToAsm.AArch64.Instr
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.CmmToAsm.AArch64.Cond
+import GHC.CmmToAsm.Ppr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Config
+import GHC.CmmToAsm.Types
+import GHC.CmmToAsm.Utils
+
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Types.Basic (Alignment, mkAlignment, alignmentBytes)
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+
+import GHC.Types.Unique ( pprUniqueAlways, getUnique )
+import GHC.Platform
+import GHC.Utils.Outputable
+
+import GHC.Utils.Panic
+
+pprProcAlignment :: NCGConfig -> SDoc
+pprProcAlignment config = maybe empty (pprAlign platform . mkAlignment) (ncgProcAlignment config)
+   where
+      platform = ncgPlatform config
+
+pprNatCmmDecl :: NCGConfig -> NatCmmDecl RawCmmStatics Instr -> SDoc
+pprNatCmmDecl config (CmmData section dats) =
+  pprSectionAlign config section $$ pprDatas config dats
+
+pprNatCmmDecl config proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+  let platform = ncgPlatform config in
+  pprProcAlignment config $$
+  case topInfoTable proc of
+    Nothing ->
+        -- special case for code without info table:
+        pprSectionAlign config (Section Text lbl) $$
+        -- do not
+        -- pprProcAlignment config $$
+        pprLabel platform lbl $$ -- blocks guaranteed not null, so label needed
+        vcat (map (pprBasicBlock config top_info) blocks) $$
+        (if ncgDwarfEnabled config
+         then ppr (mkAsmTempEndLabel lbl) <> char ':' else empty) $$
+        pprSizeDecl platform lbl
+
+    Just (CmmStaticsRaw info_lbl _) ->
+      pprSectionAlign config (Section Text info_lbl) $$
+      -- pprProcAlignment config $$
+      (if platformHasSubsectionsViaSymbols platform
+          then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+          else empty) $$
+      vcat (map (pprBasicBlock config top_info) blocks) $$
+      -- above: Even the first block gets a label, because with branch-chain
+      -- elimination, it might be the target of a goto.
+      (if platformHasSubsectionsViaSymbols platform
+       then -- See Note [Subsections Via Symbols]
+                text "\t.long "
+            <+> ppr info_lbl
+            <+> char '-'
+            <+> ppr (mkDeadStripPreventer info_lbl)
+       else empty) $$
+      pprSizeDecl platform info_lbl
+
+pprLabel :: Platform -> CLabel -> SDoc
+pprLabel platform lbl =
+   pprGloblDecl platform lbl
+   $$ pprTypeDecl platform lbl
+   $$ (pdoc platform lbl <> char ':')
+
+pprAlign :: Platform -> Alignment -> SDoc
+pprAlign _platform alignment
+        = text "\t.balign " <> int (alignmentBytes alignment)
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: Platform -> SectionType -> SDoc
+pprAlignForSection _platform _seg
+    -- .balign is stable, whereas .align is platform dependent.
+    = text "\t.balign 8" --  always 8
+
+instance Outputable Instr where
+    ppr = pprInstr genericPlatform
+
+-- | Print section header and appropriate alignment for that section.
+--
+-- This one will emit the header:
+--
+--     .section .text
+--     .balign 8
+--
+pprSectionAlign :: NCGConfig -> Section -> SDoc
+pprSectionAlign _config (Section (OtherSection _) _) =
+     panic "AArch64.Ppr.pprSectionAlign: unknown section"
+pprSectionAlign config sec@(Section seg _) =
+    pprSectionHeader config sec
+    $$ pprAlignForSection (ncgPlatform config) seg
+
+-- | Output the ELF .size directive.
+pprSizeDecl :: Platform -> CLabel -> SDoc
+pprSizeDecl platform lbl
+ = if osElfTarget (platformOS platform)
+   then text "\t.size" <+> pdoc platform lbl <> text ", .-" <> pdoc platform lbl
+   else empty
+
+pprBasicBlock :: NCGConfig -> LabelMap RawCmmStatics -> NatBasicBlock Instr
+              -> SDoc
+pprBasicBlock config info_env (BasicBlock blockid instrs)
+  = maybe_infotable $
+    pprLabel platform asmLbl $$
+    vcat (map (pprInstr platform) (id {-detectTrivialDeadlock-} optInstrs)) $$
+    (if  ncgDwarfEnabled config
+      then ppr (mkAsmTempEndLabel asmLbl) <> char ':'
+      else empty
+    )
+  where
+    -- Filter out identity moves. E.g. mov x18, x18 will be dropped.
+    optInstrs = filter f instrs
+      where f (MOV o1 o2) | o1 == o2 = False
+            f _ = True
+
+    asmLbl = blockLbl blockid
+    platform = ncgPlatform config
+    maybe_infotable c = case mapLookup blockid info_env of
+       Nothing   -> c
+       Just (CmmStaticsRaw info_lbl info) ->
+          --  pprAlignForSection platform Text $$
+           infoTableLoc $$
+           vcat (map (pprData config) info) $$
+           pprLabel platform info_lbl $$
+           c $$
+           (if ncgDwarfEnabled config
+             then ppr (mkAsmTempEndLabel info_lbl) <> char ':'
+             else empty)
+    -- Make sure the info table has the right .loc for the block
+    -- coming right after it. See [Note: Info Offset]
+    infoTableLoc = case instrs of
+      (l@LOCATION{} : _) -> pprInstr platform l
+      _other             -> empty
+
+pprDatas :: NCGConfig -> RawCmmStatics -> SDoc
+-- See Note [emit-time elimination of static indirections] in "GHC.Cmm.CLabel".
+pprDatas config (CmmStaticsRaw alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+  | lbl == mkIndStaticInfoLabel
+  , let labelInd (CmmLabelOff l _) = Just l
+        labelInd (CmmLabel l) = Just l
+        labelInd _ = Nothing
+  , Just ind' <- labelInd ind
+  , alias `mayRedirectTo` ind'
+  = pprGloblDecl (ncgPlatform config) alias
+    $$ text ".equiv" <+> pdoc (ncgPlatform config) alias <> comma <> pdoc (ncgPlatform config) (CmmLabel ind')
+
+pprDatas config (CmmStaticsRaw lbl dats)
+  = vcat (pprLabel platform lbl : map (pprData config) dats)
+   where
+      platform = ncgPlatform config
+
+pprData :: NCGConfig -> CmmStatic -> SDoc
+pprData _config (CmmString str) = pprString str
+pprData _config (CmmFileEmbed path) = pprFileEmbed path
+
+pprData config (CmmUninitialised bytes)
+ = let platform = ncgPlatform config
+   in if platformOS platform == OSDarwin
+         then text ".space " <> int bytes
+         else text ".skip "  <> int bytes
+
+pprData config (CmmStaticLit lit) = pprDataItem config lit
+
+pprGloblDecl :: Platform -> CLabel -> SDoc
+pprGloblDecl platform lbl
+  | not (externallyVisibleCLabel lbl) = empty
+  | otherwise = text "\t.globl " <> pdoc platform lbl
+
+-- Note [Always use objects for info tables]
+-- See discussion in X86.Ppr
+-- for why this is necessary.  Essentially we need to ensure that we never
+-- pass function symbols when we migth want to lookup the info table.  If we
+-- did, we could end up with procedure linking tables (PLT)s, and thus the
+-- lookup wouldn't point to the function, but into the jump table.
+--
+-- Fun fact: The LLVMMangler exists to patch this issue su on the LLVM side as
+-- well.
+pprLabelType' :: Platform -> CLabel -> SDoc
+pprLabelType' platform lbl =
+  if isCFunctionLabel lbl || functionOkInfoTable then
+    text "@function"
+  else
+    text "@object"
+  where
+    functionOkInfoTable = platformTablesNextToCode platform &&
+      isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+-- this is called pprTypeAndSizeDecl in PPC.Ppr
+pprTypeDecl :: Platform -> CLabel -> SDoc
+pprTypeDecl platform lbl
+    = if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
+      then text ".type " <> pdoc platform lbl <> text ", " <> pprLabelType' platform lbl
+      else empty
+
+pprDataItem :: NCGConfig -> CmmLit -> SDoc
+pprDataItem config lit
+  = vcat (ppr_item (cmmTypeFormat $ cmmLitType platform lit) lit)
+    where
+        platform = ncgPlatform config
+
+        imm = litToImm lit
+
+        ppr_item II8  _ = [text "\t.byte\t"  <> pprImm platform imm]
+        ppr_item II16 _ = [text "\t.short\t" <> pprImm platform imm]
+        ppr_item II32 _ = [text "\t.long\t"  <> pprImm platform imm]
+        ppr_item II64 _ = [text "\t.quad\t"  <> pprImm platform imm]
+
+        ppr_item FF32  (CmmFloat r _)
+           = let bs = floatToBytes (fromRational r)
+             in  map (\b -> text "\t.byte\t" <> pprImm platform (ImmInt b)) bs
+
+        ppr_item FF64 (CmmFloat r _)
+           = let bs = doubleToBytes (fromRational r)
+             in  map (\b -> text "\t.byte\t" <> pprImm platform (ImmInt b)) bs
+
+        ppr_item _ _ = pprPanic "pprDataItem:ppr_item" (text $ show lit)
+
+floatToBytes :: Float -> [Int]
+floatToBytes f
+   = runST (do
+        arr <- newArray_ ((0::Int),3)
+        writeArray arr 0 f
+        arr <- castFloatToWord8Array arr
+        i0 <- readArray arr 0
+        i1 <- readArray arr 1
+        i2 <- readArray arr 2
+        i3 <- readArray arr 3
+        return (map fromIntegral [i0,i1,i2,i3])
+     )
+
+castFloatToWord8Array :: STUArray s Int Float -> ST s (STUArray s Int Word8)
+castFloatToWord8Array = U.castSTUArray
+
+pprImm :: Platform -> Imm -> SDoc
+pprImm _ (ImmInt i)     = int i
+pprImm _ (ImmInteger i) = integer i
+pprImm p (ImmCLbl l)    = pdoc p l
+pprImm p (ImmIndex l i) = pdoc p l <> char '+' <> int i
+pprImm _ (ImmLit s)     = s
+
+-- TODO: See pprIm below for why this is a bad idea!
+pprImm _ (ImmFloat f)
+  | f == 0 = text "wzr"
+  | otherwise = float (fromRational f)
+pprImm _ (ImmDouble d)
+  | d == 0 = text "xzr"
+  | otherwise = double (fromRational d)
+
+pprImm p (ImmConstantSum a b) = pprImm p a <> char '+' <> pprImm p b
+pprImm p (ImmConstantDiff a b) = pprImm p a <> char '-'
+                   <> lparen <> pprImm p b <> rparen
+
+
+-- aarch64 GNU as uses // for comments.
+asmComment :: SDoc -> SDoc
+asmComment c = whenPprDebug $ text "#" <+> c
+
+asmDoubleslashComment :: SDoc -> SDoc
+asmDoubleslashComment c = whenPprDebug $ text "//" <+> c
+
+asmMultilineComment :: SDoc -> SDoc
+asmMultilineComment c = whenPprDebug $ text "/*" $+$ c $+$ text "*/"
+
+pprIm :: Platform -> Imm -> SDoc
+pprIm platform im = case im of
+  ImmInt i     -> char '#' <> int i
+  ImmInteger i -> char '#' <> integer i
+
+  -- TODO: This will only work for
+  -- The floating point value must be expressable as ±n ÷ 16 × 2^r,
+  -- where n and r are integers such that 16 ≤ n ≤ 31 and -3 ≤ r ≤ 4.
+  -- and 0 needs to be encoded as wzr/xzr.
+  --
+  -- Except for 0, we might want to either split it up into enough
+  -- ADD operations into an Integer register and then just bit copy it into
+  -- the double register? See the toBytes + fromRational above for data items.
+  -- This is something the x86 backend does.
+  --
+  -- We could also just turn them into statics :-/ Which is what the
+  -- PowerPC backend odes.
+  ImmFloat f | f == 0 -> text "wzr"
+  ImmFloat f -> char '#' <> float (fromRational f)
+  ImmDouble d | d == 0 -> text "xzr"
+  ImmDouble d -> char '#' <> double (fromRational d)
+  -- =<lbl> pseudo instruction!
+  ImmCLbl l    -> char '=' <> pdoc platform l
+  ImmIndex l o -> text "[=" <> pdoc platform l <> comma <+> char '#' <> int o <> char ']'
+  _            -> panic "AArch64.pprIm"
+
+pprExt :: ExtMode -> SDoc
+pprExt EUXTB = text "uxtb"
+pprExt EUXTH = text "uxth"
+pprExt EUXTW = text "uxtw"
+pprExt EUXTX = text "uxtx"
+pprExt ESXTB = text "sxtb"
+pprExt ESXTH = text "sxth"
+pprExt ESXTW = text "sxtw"
+pprExt ESXTX = text "sxtx"
+
+pprShift :: ShiftMode -> SDoc
+pprShift SLSL = text "lsl"
+pprShift SLSR = text "lsr"
+pprShift SASR = text "asr"
+pprShift SROR = text "ror"
+
+pprOp :: Platform -> Operand -> SDoc
+pprOp plat op = case op of
+  OpReg w r           -> pprReg w r
+  OpRegExt w r x 0 -> pprReg w r <> comma <+> pprExt x
+  OpRegExt w r x i -> pprReg w r <> comma <+> pprExt x <> comma <+> char '#' <> int i
+  OpRegShift w r s i -> pprReg w r <> comma <+> pprShift s <> comma <+> char '#' <> int i
+  OpImm im          -> pprIm plat im
+  OpImmShift im s i -> pprIm plat im <> comma <+> pprShift s <+> char '#' <> int i
+  -- TODO: Address compuation always use registers as 64bit -- is this correct?
+  OpAddr (AddrRegReg r1 r2) -> char '[' <+> pprReg W64 r1 <> comma <+> pprReg W64 r2 <+> char ']'
+  OpAddr (AddrRegImm r1 im) -> char '[' <+> pprReg W64 r1 <> comma <+> pprImm plat im <+> char ']'
+  OpAddr (AddrReg r1)       -> char '[' <+> pprReg W64 r1 <+> char ']'
+
+pprReg :: Width -> Reg -> SDoc
+pprReg w r = case r of
+  RegReal    (RealRegSingle i) -> ppr_reg_no w i
+  RegReal    (RealRegPair{})   -> panic "AArch64.pprReg: no reg pairs on this arch!"
+  -- virtual regs should not show up, but this is helpful for debugging.
+  RegVirtual (VirtualRegI u)   -> text "%vI_" <> pprUniqueAlways u
+  RegVirtual (VirtualRegF u)   -> text "%vF_" <> pprUniqueAlways u
+  RegVirtual (VirtualRegD u)   -> text "%vD_" <> pprUniqueAlways u
+  _                            -> pprPanic "AArch64.pprReg" (text $ show r)
+
+  where
+    ppr_reg_no :: Width -> Int -> SDoc
+    ppr_reg_no w 31
+         | w == W64 = text "sp"
+         | w == W32 = text "wsp"
+
+    ppr_reg_no w i
+         | i < 0, w == W32 = text "wzr"
+         | i < 0, w == W64 = text "xzr"
+         | i < 0 = pprPanic "Invalid Zero Reg" (ppr w <+> int i)
+         -- General Purpose Registers
+         | i <= 31, w == W8  = text "w" <> int i      -- there are no byte or half
+         | i <= 31, w == W16 = text "w" <> int i      -- words... word will do.
+         | i <= 31, w == W32 = text "w" <> int i
+         | i <= 31, w == W64 = text "x" <> int i
+         | i <= 31 = pprPanic "Invalid Reg" (ppr w <+> int i)
+         -- Floating Point Registers
+         | i <= 63, w == W8  = text "b" <> int (i-32)
+         | i <= 63, w == W16 = text "h" <> int (i-32)
+         | i <= 63, w == W32 = text "s" <> int (i-32)
+         | i <= 63, w == W64 = text "d" <> int (i-32)
+         -- no support for 'q'uad in GHC's NCG yet.
+         | otherwise = text "very naughty powerpc register"
+
+isFloatOp :: Operand -> Bool
+isFloatOp (OpReg _ (RegReal (RealRegSingle i))) | i > 31 = True
+isFloatOp (OpReg _ (RegVirtual (VirtualRegF _))) = True
+isFloatOp (OpReg _ (RegVirtual (VirtualRegD _))) = True
+isFloatOp _ = False
+
+pprInstr :: Platform -> Instr -> SDoc
+pprInstr platform instr = case instr of
+  -- Meta Instructions ---------------------------------------------------------
+  COMMENT s  -> asmComment s
+  MULTILINE_COMMENT s -> asmMultilineComment s
+  ANN d i -> pprInstr platform i <+> asmDoubleslashComment d
+  LOCATION file line col _name
+    -> text "\t.loc" <+> ppr file <+> ppr line <+> ppr col
+  DELTA d    -> asmComment $ text ("\tdelta = " ++ show d)
+  NEWBLOCK _ -> panic "PprInstr: NEWBLOCK"
+  LDATA _ _  -> panic "pprInstr: LDATA"
+
+  -- Pseudo Instructions -------------------------------------------------------
+
+  PUSH_STACK_FRAME -> text "\tstp x29, x30, [sp, #-16]!"
+                   $$ text "\tmov x29, sp"
+
+  POP_STACK_FRAME -> text "\tldp x29, x30, [sp], #16"
+  -- ===========================================================================
+  -- AArch64 Instruction Set
+  -- 1. Arithmetic Instructions ------------------------------------------------
+  ADD  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfadd"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tadd"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  CMN  o1 o2    -> text "\tcmn"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  CMP  o1 o2
+    | isFloatOp o1 && isFloatOp o2 -> text "\tfcmp"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tcmp" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MSUB o1 o2 o3 o4 -> text "\tmsub" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  MUL  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfmul"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tmul"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  NEG  o1 o2
+    | isFloatOp o1 && isFloatOp o2 -> text "\tfneg"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tneg"  <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  SDIV o1 o2 o3 | isFloatOp o1 && isFloatOp o2 && isFloatOp o3
+    -> text "\tfdiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  SDIV o1 o2 o3 -> text "\tsdiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  SUB  o1 o2 o3
+    | isFloatOp o1 && isFloatOp o2 && isFloatOp o3 -> text "\tfsub"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+    | otherwise -> text "\tsub"  <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  UDIV o1 o2 o3 -> text "\tudiv" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  -- 2. Bit Manipulation Instructions ------------------------------------------
+  SBFM o1 o2 o3 o4 -> text "\tsbfm" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  UBFM o1 o2 o3 o4 -> text "\tubfm" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
+  -- 3. Logical and Move Instructions ------------------------------------------
+  AND o1 o2 o3  -> text "\tand" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ANDS o1 o2 o3 -> text "\tands" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ASR o1 o2 o3  -> text "\tasr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  BIC o1 o2 o3  -> text "\tbic" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  BICS o1 o2 o3 -> text "\tbics" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  EON o1 o2 o3  -> text "\teon" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  EOR o1 o2 o3  -> text "\teor" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LSL o1 o2 o3  -> text "\tlsl" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LSR o1 o2 o3  -> text "\tlsr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  MOV o1 o2
+    | isFloatOp o1 || isFloatOp o2 -> text "\tfmov" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+    | otherwise -> text "\tmov" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MOVK o1 o2    -> text "\tmovk" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  MVN o1 o2     -> text "\tmvn" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  ORN o1 o2 o3  -> text "\torn" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ORR o1 o2 o3  -> text "\torr" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  ROR o1 o2 o3  -> text "\tror" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  TST o1 o2     -> text "\ttst" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+  -- 4. Branch Instructions ----------------------------------------------------
+  J t            -> pprInstr platform (B t)
+  B (TBlock bid) -> text "\tb" <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  B (TLabel lbl) -> text "\tb" <+> pdoc platform lbl
+  B (TReg r)     -> text "\tbr" <+> pprReg W64 r
+
+  BL (TBlock bid) _ _ -> text "\tbl" <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  BL (TLabel lbl) _ _ -> text "\tbl" <+> pdoc platform lbl
+  BL (TReg r)     _ _ -> text "\tblr" <+> pprReg W64 r
+
+  BCOND c (TBlock bid) -> text "\t" <> pprBcond c <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  BCOND c (TLabel lbl) -> text "\t" <> pprBcond c <+> pdoc platform lbl
+  BCOND _ (TReg _)     -> panic "AArch64.ppr: No conditional branching to registers!"
+
+  -- 5. Atomic Instructions ----------------------------------------------------
+  -- 6. Conditional Instructions -----------------------------------------------
+  CSET o c  -> text "\tcset" <+> pprOp platform o <> comma <+> pprCond c
+
+  CBZ o (TBlock bid) -> text "\tcbz" <+> pprOp platform o <> comma <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  CBZ o (TLabel lbl) -> text "\tcbz" <+> pprOp platform o <> comma <+> pdoc platform lbl
+  CBZ _ (TReg _)     -> panic "AArch64.ppr: No conditional (cbz) branching to registers!"
+
+  CBNZ o (TBlock bid) -> text "\tcbnz" <+> pprOp platform o <> comma <+> pdoc platform (mkLocalBlockLabel (getUnique bid))
+  CBNZ o (TLabel lbl) -> text "\tcbnz" <+> pprOp platform o <> comma <+> pdoc platform lbl
+  CBNZ _ (TReg _)     -> panic "AArch64.ppr: No conditional (cbnz) branching to registers!"
+
+  -- 7. Load and Store Instructions --------------------------------------------
+  -- NOTE: GHC may do whacky things where it only load the lower part of an
+  --       address. Not observing the correct size when loading will lead
+  --       inevitably to crashes.
+  STR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tstrb" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  STR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tstrh" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  STR _f o1 o2 -> text "\tstr" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+#if defined(darwin_HOST_OS)
+  LDR _f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@page" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@pageoff" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpage" $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@gotpageoff" <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@page" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> pdoc platform lbl <> text "@pageoff"
+#else
+  LDR _f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]" $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text ":lo12:" <> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> char '#' <> int off -- TODO: check that off is in 12bits.
+
+  LDR _f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) | isForeignLabel lbl ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> text ":got:" <> pdoc platform lbl $$
+    text "\tldr" <+> pprOp platform o1 <> comma <+> text "[" <> pprOp platform o1 <> comma <+> text ":got_lo12:" <> pdoc platform lbl <> text "]"
+
+  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
+    text "\tadrp" <+> pprOp platform o1 <> comma <+> pdoc platform lbl $$
+    text "\tadd" <+> pprOp platform o1 <> comma <+> pprOp platform o1 <> comma <+> text ":lo12:" <> pdoc platform lbl
+#endif
+
+  LDR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tldrsb" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  LDR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+    text "\tldrsh" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  LDR _f o1 o2 -> text "\tldr" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+  STP _f o1 o2 o3 -> text "\tstp" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+  LDP _f o1 o2 o3 -> text "\tldp" <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
+
+  -- 8. Synchronization Instructions -------------------------------------------
+  DMBSY -> text "\tdmb sy"
+  -- 8. Synchronization Instructions -------------------------------------------
+  FCVT o1 o2 -> text "\tfcvt" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  SCVTF o1 o2 -> text "\tscvtf" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+  FCVTZS o1 o2 -> text "\tfcvtzs" <+> pprOp platform o1 <> comma <+> pprOp platform o2
+
+pprBcond :: Cond -> SDoc
+pprBcond c = text "b." <> pprCond c
+
+pprCond :: Cond -> SDoc
+pprCond c = case c of
+  ALWAYS -> text "al" -- Always
+  EQ     -> text "eq" -- Equal
+  NE     -> text "ne" -- Not Equal
+
+  SLT    -> text "lt" -- Signed less than                  ; Less than, or unordered
+  SLE    -> text "le" -- Signed less than or equal         ; Less than or equal, or unordered
+  SGE    -> text "ge" -- Signed greater than or equal      ; Greater than or equal
+  SGT    -> text "gt" -- Signed greater than               ; Greater than
+
+  ULT    -> text "lo" -- Carry clear/ unsigned lower       ; less than
+  ULE    -> text "ls" -- Unsigned lower or same            ; Less than or equal
+  UGE    -> text "hs" -- Carry set/unsigned higher or same ; Greater than or equal, or unordered
+  UGT    -> text "hi" -- Unsigned higher                   ; Greater than, or unordered
+
+  NEVER  -> text "nv" -- Never
+  VS     -> text "vs" -- Overflow                          ; Unordered (at least one NaN operand)
+  VC     -> text "vc" -- No overflow                       ; Not unordered
+
+  -- Orderd variants.  Respecting NaN.
+  OLT    -> text "mi"
+  OLE    -> text "ls"
+  OGE    -> text "ge"
+  OGT    -> text "gt"
+
+  -- Unordered
+  UOLT   -> text "lt"
+  UOLE   -> text "le"
+  UOGE   -> text "pl"
+  UOGT   -> text "hi"
diff --git a/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs b/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs
new file mode 100644
index 0000000000..8c3d081e92
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/RegInfo.hs
@@ -0,0 +1,31 @@
+module GHC.CmmToAsm.AArch64.RegInfo where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Instr
+import GHC.Cmm.BlockId
+import GHC.Cmm
+
+import GHC.Utils.Outputable
+
+data JumpDest = DestBlockId BlockId
+
+-- Debug Instance
+instance Outputable JumpDest where
+  ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
+
+-- TODO: documen what this does. See Ticket 19914
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+
+-- TODO: document what this does. See Ticket 19914
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut _ = Nothing
+
+-- TODO: document what this does. See Ticket 19914
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
+shortcutStatics _ other_static = other_static
+
+-- TODO: document what this does. See Ticket 19914
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump _ other = other
diff --git a/compiler/GHC/CmmToAsm/AArch64/Regs.hs b/compiler/GHC/CmmToAsm/AArch64/Regs.hs
new file mode 100644
index 0000000000..fd1669eeac
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/AArch64/Regs.hs
@@ -0,0 +1,167 @@
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module GHC.CmmToAsm.AArch64.Regs where
+
+import GHC.Prelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+import GHC.Cmm.CLabel           ( CLabel )
+import GHC.Types.Unique
+
+import GHC.Platform.Regs
+import GHC.Utils.Outputable
+import GHC.Utils.Panic
+import GHC.Platform
+
+allMachRegNos   :: [RegNo]
+allMachRegNos   = [0..31] ++ [32..63]
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs for which we are prepared to allow the
+-- register allocator to attempt to map VRegs to.
+allocatableRegs :: Platform -> [RealReg]
+allocatableRegs platform
+   = let isFree i = freeReg platform i
+     in  map RealRegSingle $ filter isFree allMachRegNos
+
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+allGpArgRegs :: [Reg]
+allGpArgRegs = map regSingle [0..7]
+allFpArgRegs :: [Reg]
+allFpArgRegs = map regSingle [32..39]
+
+-- STG:
+-- 19: Base
+-- 20: Sp
+-- 21: Hp
+-- 22-27: R1-R6
+-- 28: SpLim
+
+-- This is the STG Sp reg.
+-- sp :: Reg
+-- sp = regSingle 20
+
+-- addressing modes ------------------------------------------------------------
+
+data AddrMode
+        = AddrRegReg    Reg Reg
+        | AddrRegImm    Reg Imm
+        | AddrReg       Reg
+        deriving (Eq, Show)
+
+-- -----------------------------------------------------------------------------
+-- Immediates
+
+data Imm
+  = ImmInt      Int
+  | ImmInteger  Integer     -- Sigh.
+  | ImmCLbl     CLabel      -- AbstractC Label (with baggage)
+  | ImmLit      SDoc        -- Simple string
+  | ImmIndex    CLabel Int
+  | ImmFloat    Rational
+  | ImmDouble   Rational
+  | ImmConstantSum Imm Imm
+  | ImmConstantDiff Imm Imm
+  deriving (Eq, Show)
+
+instance Show SDoc where
+  show = showPprUnsafe . ppr
+
+instance Eq SDoc where
+  lhs == rhs = show lhs == show rhs
+
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+litToImm :: CmmLit -> Imm
+litToImm (CmmInt i w)        = ImmInteger (narrowS w i)
+                -- narrow to the width: a CmmInt might be out of
+                -- range, but we assume that ImmInteger only contains
+                -- in-range values.  A signed value should be fine here.
+litToImm (CmmFloat f W32)    = ImmFloat f
+litToImm (CmmFloat f W64)    = ImmDouble f
+litToImm (CmmLabel l)        = ImmCLbl l
+litToImm (CmmLabelOff l off) = ImmIndex l off
+litToImm (CmmLabelDiffOff l1 l2 off _)
+                             = ImmConstantSum
+                               (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+                               (ImmInt off)
+litToImm _                   = panic "AArch64.Regs.litToImm: no match"
+
+
+-- == To satisfy GHC.CmmToAsm.Reg.Target =======================================
+
+-- squeese functions for the graph allocator -----------------------------------
+-- | regSqueeze_class reg
+--      Calculate the maximum number of register colors that could be
+--      denied to a node of this class due to having this reg
+--      as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+virtualRegSqueeze cls vr
+ = case cls of
+        RcInteger
+         -> case vr of
+                VirtualRegI{}           -> 1
+                VirtualRegHi{}          -> 1
+                _other                  -> 0
+
+        RcDouble
+         -> case vr of
+                VirtualRegD{}           -> 1
+                VirtualRegF{}           -> 0
+                _other                  -> 0
+
+        _other -> 0
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+realRegSqueeze cls rr
+ = case cls of
+        RcInteger
+         -> case rr of
+                RealRegSingle regNo
+                        | regNo < 32    -> 1     -- first fp reg is 32
+                        | otherwise     -> 0
+
+                RealRegPair{}           -> 0
+
+        RcDouble
+         -> case rr of
+                RealRegSingle regNo
+                        | regNo < 32    -> 0
+                        | otherwise     -> 1
+
+                RealRegPair{}           -> 0
+
+        _other -> 0
+
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+   | not (isFloatFormat format) = VirtualRegI u
+   | otherwise
+   = case format of
+        FF32    -> VirtualRegD u
+        FF64    -> VirtualRegD u
+        _       -> panic "AArch64.mkVirtualReg"
+
+{-# INLINE classOfRealReg      #-}
+classOfRealReg :: RealReg -> RegClass
+classOfRealReg (RealRegSingle i)
+        | i < 32        = RcInteger
+        | otherwise     = RcDouble
+
+classOfRealReg (RealRegPair{})
+        = panic "regClass(ppr): no reg pairs on this architecture"
+
+regDotColor :: RealReg -> SDoc
+regDotColor reg
+ = case classOfRealReg reg of
+        RcInteger       -> text "blue"
+        RcFloat         -> text "red"
+        RcDouble        -> text "green"
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
index e9047256e8..b8fb5706cb 100644
--- a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
+++ b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
@@ -216,6 +216,7 @@ dwarfRegNo p r = case platformArch p of
     | r == xmm14 -> 31
     | r == xmm15 -> 32
   ArchPPC_64 _ -> fromIntegral $ toRegNo r
+  ArchAArch64  -> fromIntegral $ toRegNo r
   _other -> error "dwarfRegNo: Unsupported platform or unknown register!"
 
 -- | Virtual register number to use for return address.
@@ -228,4 +229,5 @@ dwarfReturnRegNo p
     ArchX86    -> 8  -- eip
     ArchX86_64 -> 16 -- rip
     ArchPPC_64 ELF_V2 -> 65 -- lr (link register)
+    ArchAArch64-> 30
     _other     -> error "dwarfReturnRegNo: Unsupported platform!"
diff --git a/compiler/GHC/CmmToAsm/Format.hs b/compiler/GHC/CmmToAsm/Format.hs
index 207de095ae..390ef29bd2 100644
--- a/compiler/GHC/CmmToAsm/Format.hs
+++ b/compiler/GHC/CmmToAsm/Format.hs
@@ -12,6 +12,7 @@ module GHC.CmmToAsm.Format (
     Format(..),
     intFormat,
     floatFormat,
+    isIntFormat,
     isFloatFormat,
     cmmTypeFormat,
     formatToWidth,
@@ -73,6 +74,9 @@ floatFormat width
 
         other   -> pprPanic "Format.floatFormat" (ppr other)
 
+-- | Check if a format represent an integer value.
+isIntFormat :: Format -> Bool
+isIntFormat = not . isFloatFormat
 
 -- | Check if a format represents a floating point value.
 isFloatFormat :: Format -> Bool
diff --git a/compiler/GHC/CmmToAsm/Instr.hs b/compiler/GHC/CmmToAsm/Instr.hs
index 0a62c1d3bb..bc2e2969e6 100644
--- a/compiler/GHC/CmmToAsm/Instr.hs
+++ b/compiler/GHC/CmmToAsm/Instr.hs
@@ -31,6 +31,7 @@ data RegUsage
                 reads :: [Reg],
                 writes :: [Reg]
                 }
+        deriving Show
 
 -- | No regs read or written to.
 noUsage :: RegUsage
@@ -90,7 +91,7 @@ class Instruction instr where
                 -> Reg          -- ^ the reg to spill
                 -> Int          -- ^ the current stack delta
                 -> Int          -- ^ spill slot to use
-                -> instr
+                -> [instr]        -- ^ instructions
 
 
         -- | An instruction to reload a register from a spill slot.
@@ -99,7 +100,7 @@ class Instruction instr where
                 -> Reg          -- ^ the reg to reload.
                 -> Int          -- ^ the current stack delta
                 -> Int          -- ^ the spill slot to use
-                -> instr
+                -> [instr]        -- ^ instructions
 
         -- | See if this instruction is telling us the current C stack delta
         takeDeltaInstr
@@ -157,3 +158,6 @@ class Instruction instr where
 
         -- | Pretty-print an instruction
         pprInstr :: Platform -> instr -> SDoc
+
+        -- Create a comment instruction
+        mkComment :: SDoc -> [instr]
diff --git a/compiler/GHC/CmmToAsm/PIC.hs b/compiler/GHC/CmmToAsm/PIC.hs
index 7fe90c3ec6..81ce9d34a9 100644
--- a/compiler/GHC/CmmToAsm/PIC.hs
+++ b/compiler/GHC/CmmToAsm/PIC.hs
@@ -122,6 +122,15 @@ cmmMakeDynamicReference config referenceKind lbl
               addImport stub
               return $ CmmLit $ CmmLabel stub
 
+        -- GOT relative loads work differently on AArch64.  We don't do two
+        -- step loads. The got symbol is loaded directly, and not through an
+        -- additional load. Thus we do not need the CmmLoad decoration we have
+        -- on other platforms.
+        AccessViaSymbolPtr | ArchAArch64 <- platformArch platform -> do
+              let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
+              addImport symbolPtr
+              return $ cmmMakePicReference config symbolPtr
+
         AccessViaSymbolPtr -> do
               let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
               addImport symbolPtr
@@ -135,7 +144,6 @@ cmmMakeDynamicReference config referenceKind lbl
                 -- so just jump there if it's a call or a jump
               _ -> return $ CmmLit $ CmmLabel lbl
 
-
 -- -----------------------------------------------------------------------------
 -- Create a position independent reference to a label.
 -- (but do not bother with dynamic linking).
@@ -150,6 +158,11 @@ cmmMakePicReference config lbl
   | OSMinGW32 <- platformOS platform
   = CmmLit $ CmmLabel lbl
 
+  -- no pic base reg on AArch64, however indicate this symbol should go through
+  -- the global offset table (GOT).
+  | ArchAArch64 <- platformArch platform
+  = CmmLit $ CmmLabel lbl
+
   | OSAIX <- platformOS platform
   = CmmMachOp (MO_Add W32)
           [ CmmReg (CmmGlobal PicBaseReg)
@@ -241,6 +254,20 @@ howToAccessLabel config _arch OSMinGW32 _kind lbl
         | otherwise
         = AccessDirectly
 
+-- On AArch64, relocations for JUMP and CALL will be emitted with 26bits, this
+-- is enough for ~64MB of range. Anything else will need to go through a veneer,
+-- which is the job of the linker to build.  We might only want to lookup
+-- Data References through the GOT.
+howToAccessLabel config ArchAArch64 _os _kind lbl
+        | not (ncgExternalDynamicRefs config)
+        = AccessDirectly
+
+        | labelDynamic config lbl
+        = AccessViaSymbolPtr
+
+        | otherwise
+        = AccessDirectly
+
 
 -- Mach-O (Darwin, Mac OS X)
 --
@@ -275,7 +302,7 @@ howToAccessLabel config arch OSDarwin JumpReference lbl
         -- dyld code stubs don't work for tailcalls because the
         -- stack alignment is only right for regular calls.
         -- Therefore, we have to go via a symbol pointer:
-        | arch == ArchX86 || arch == ArchX86_64
+        | arch == ArchX86 || arch == ArchX86_64 || arch == ArchAArch64
         , labelDynamic config lbl
         = AccessViaSymbolPtr
 
@@ -283,15 +310,15 @@ howToAccessLabel config arch OSDarwin JumpReference lbl
 howToAccessLabel config arch OSDarwin _kind lbl
         -- Code stubs are the usual method of choice for imported code;
         -- not needed on x86_64 because Apple's new linker, ld64, generates
-        -- them automatically.
+        -- them automatically, neither on Aarch64 (arm64).
         | arch /= ArchX86_64
+        , arch /= ArchAArch64
         , labelDynamic config lbl
         = AccessViaStub
 
         | otherwise
         = AccessDirectly
 
-
 ----------------------------------------------------------------------------
 -- AIX
 
@@ -616,7 +643,9 @@ pprImportedSymbol config importedLbl = case (arch,os) of
         | otherwise
         -> empty
 
-   (_, OSDarwin) -> empty
+   (ArchAArch64, OSDarwin)
+        -> empty
+
 
 
    -- XCOFF / AIX
diff --git a/compiler/GHC/CmmToAsm/PPC.hs b/compiler/GHC/CmmToAsm/PPC.hs
index 148fd1b4b2..d38eb84c64 100644
--- a/compiler/GHC/CmmToAsm/PPC.hs
+++ b/compiler/GHC/CmmToAsm/PPC.hs
@@ -57,5 +57,4 @@ instance Instruction PPC.Instr where
    mkStackAllocInstr   = PPC.mkStackAllocInstr
    mkStackDeallocInstr = PPC.mkStackDeallocInstr
    pprInstr            = PPC.pprInstr
-
-
+   mkComment           = const []
diff --git a/compiler/GHC/CmmToAsm/PPC/Instr.hs b/compiler/GHC/CmmToAsm/PPC/Instr.hs
index 92567989ed..54a73f24a9 100644
--- a/compiler/GHC/CmmToAsm/PPC/Instr.hs
+++ b/compiler/GHC/CmmToAsm/PPC/Instr.hs
@@ -533,7 +533,7 @@ mkSpillInstr
    -> Reg       -- register to spill
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
+   -> [Instr]
 
 mkSpillInstr config reg delta slot
   = let platform = ncgPlatform config
@@ -550,7 +550,7 @@ mkSpillInstr config reg delta slot
                 Just _  -> ST
                 Nothing -> STFAR -- pseudo instruction: 32 bit offsets
 
-    in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+    in [instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))]
 
 
 mkLoadInstr
@@ -558,7 +558,7 @@ mkLoadInstr
    -> Reg       -- register to load
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
+   -> [Instr]
 
 mkLoadInstr config reg delta slot
   = let platform = ncgPlatform config
@@ -575,7 +575,7 @@ mkLoadInstr config reg delta slot
                 Just _  -> LD
                 Nothing -> LDFAR -- pseudo instruction: 32 bit offsets
 
-    in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+    in [instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))]
 
 
 -- | The size of a minimal stackframe header including minimal
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
index 1050fbaa96..83f581cac4 100644
--- a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
@@ -111,7 +111,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts excl
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 15
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            -- We should be able to allocate *a lot* more in princple.
+                            -- essentially all 32 - SP, so 31, we'd trash the link reg
+                            -- as well as the platform and all others though.
+                            ArchAArch64   -> 18
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
@@ -143,7 +146,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclus
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 0
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            -- we can in princple address all the float regs as
+                            -- segments. So we could have 64 Float regs. Or
+                            -- 128 Half regs, or even 256 Byte regs.
+                            ArchAArch64   -> 0
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
@@ -177,7 +183,7 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclu
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
                             ArchPPC_64 _  -> 20
                             ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchAArch64   -> panic "trivColorable ArchAArch64"
+                            ArchAArch64   -> 32
                             ArchAlpha     -> panic "trivColorable ArchAlpha"
                             ArchMipseb    -> panic "trivColorable ArchMipseb"
                             ArchMipsel    -> panic "trivColorable ArchMipsel"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear.hs b/compiler/GHC/CmmToAsm/Reg/Linear.hs
index 88fdcd6bce..a9a4545f62 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear.hs
@@ -110,10 +110,11 @@ import GHC.CmmToAsm.Reg.Linear.StackMap
 import GHC.CmmToAsm.Reg.Linear.FreeRegs
 import GHC.CmmToAsm.Reg.Linear.Stats
 import GHC.CmmToAsm.Reg.Linear.JoinToTargets
-import qualified GHC.CmmToAsm.Reg.Linear.PPC    as PPC
-import qualified GHC.CmmToAsm.Reg.Linear.SPARC  as SPARC
-import qualified GHC.CmmToAsm.Reg.Linear.X86    as X86
-import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.PPC     as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC   as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
 import GHC.CmmToAsm.Reg.Target
 import GHC.CmmToAsm.Reg.Liveness
 import GHC.CmmToAsm.Reg.Utils
@@ -121,6 +122,7 @@ import GHC.CmmToAsm.Instr
 import GHC.CmmToAsm.Config
 import GHC.CmmToAsm.Types
 import GHC.Platform.Reg
+import GHC.Platform.Reg.Class (RegClass(..))
 
 import GHC.Cmm.BlockId
 import GHC.Cmm.Dataflow.Collections
@@ -202,7 +204,7 @@ regAlloc _ (CmmProc _ _ _ _)
 --   an entry in the block map or it is the first block.
 --
 linearRegAlloc
-        :: forall instr. Instruction instr
+        :: forall instr. (Instruction instr)
         => NCGConfig
         -> [BlockId] -- ^ entry points
         -> BlockMap RegSet
@@ -220,7 +222,7 @@ linearRegAlloc config entry_ids block_live sccs
       ArchSPARC64    -> panic "linearRegAlloc ArchSPARC64"
       ArchPPC        -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
       ArchARM _ _ _  -> panic "linearRegAlloc ArchARM"
-      ArchAArch64    -> panic "linearRegAlloc ArchAArch64"
+      ArchAArch64    -> go $ (frInitFreeRegs platform :: AArch64.FreeRegs)
       ArchPPC_64 _   -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
       ArchAlpha      -> panic "linearRegAlloc ArchAlpha"
       ArchMipseb     -> panic "linearRegAlloc ArchMipseb"
@@ -487,7 +489,7 @@ isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
 
 
 genRaInsn :: forall freeRegs instr.
-             OutputableRegConstraint freeRegs instr
+             (OutputableRegConstraint freeRegs instr)
           => BlockMap RegSet
           -> [instr]
           -> BlockId
@@ -497,7 +499,7 @@ genRaInsn :: forall freeRegs instr.
           -> RegM freeRegs ([instr], [NatBasicBlock instr])
 
 genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
---   pprTraceM "genRaInsn" $ ppr (block_id, instr)
+-- pprTraceM "genRaInsn" $ ppr (block_id, instr)
   platform <- getPlatform
   case regUsageOfInstr platform instr of { RU read written ->
     do
@@ -509,19 +511,20 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
     -- so using nub isn't a problem).
     let virt_read       = nub [ vr      | (RegVirtual vr) <- read ] :: [VirtualReg]
 
-    -- debugging
-{-    freeregs <- getFreeRegsR
-    assig    <- getAssigR
-    pprDebugAndThen (defaultDynFlags Settings{ sTargetPlatform=platform } undefined) trace "genRaInsn"
-        (ppr instr
-                $$ text "r_dying      = " <+> ppr r_dying
-                $$ text "w_dying      = " <+> ppr w_dying
-                $$ text "virt_read    = " <+> ppr virt_read
-                $$ text "virt_written = " <+> ppr virt_written
-                $$ text "freeregs     = " <+> text (show freeregs)
-                $$ text "assig        = " <+> ppr assig)
-        $ do
--}
+--     do
+--         let real_read       = nub [ rr      | (RegReal rr) <- read]
+--         freeregs <- getFreeRegsR
+--         assig    <- getAssigR
+
+--         pprTraceM "genRaInsn"
+--                 (          text "block        = " <+> ppr block_id
+--                         $$ text "instruction  = " <+> ppr instr
+--                         $$ text "r_dying      = " <+> ppr r_dying
+--                         $$ text "w_dying      = " <+> ppr w_dying
+--                         $$ text "read         = " <+> ppr real_read    <+> ppr virt_read
+--                         $$ text "written      = " <+> ppr real_written <+> ppr virt_written
+--                         $$ text "freeregs     = " <+> ppr freeregs
+--                         $$ text "assign       = " <+> ppr assig)
 
     -- (a), (b) allocate real regs for all regs read by this instruction.
     (r_spills, r_allocd) <-
@@ -580,7 +583,6 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
                         Nothing -> x
                         Just y  -> y
 
-
     -- (j) free up stack slots for dead spilled regs
     -- TODO (can't be bothered right now)
 
@@ -592,7 +594,32 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
                                  | src == dst   -> []
                                 _               -> [patched_instr]
 
-    let code = concat [ squashed_instr, w_spills, reverse r_spills, clobber_saves, new_instrs ]
+    -- On the use of @reverse@ below.
+    -- Since we can have spills and reloads produce multiple instructions
+    -- we need to ensure they are emitted in the correct order.  We used to only
+    -- emit single instructions in mkSpill/mkReload/mkRegRegMove.
+    -- As such order of spills and reloads didn't matter.  However,  with
+    -- mutliple instructions potentially issued by those functions we need to be
+    -- careful to not break execution order. Reversing the spills (clobber will
+    -- also spill), will ensure they are emitted in the right order.
+    --
+    -- See also Ticket 19910 for changing the return type from [] to OrdList.
+
+    -- For debugging, uncomment the follow line and the mkComment lines.
+    -- u <- getUniqueR
+    let code = concat [ --  mkComment (text "<genRaInsn(" <> ppr u <> text ")>")
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):squashed>")]
+                        squashed_instr
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):w_spills>")
+                      , reverse w_spills
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):r_spills>")
+                      , reverse r_spills
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):clobber_saves>")
+                      , reverse clobber_saves
+                        -- ,mkComment (text "<genRaInsn(" <> ppr u <> text "):new_instrs>")
+                      , new_instrs
+                        -- ,mkComment (text "</genRaInsn(" <> ppr u <> text ")>")
+                      ]
 
 --    pprTrace "patched-code" ((vcat $ map (docToSDoc . pprInstr) code)) $ do
 --    pprTrace "pached-fixup" ((ppr fixup_blocks)) $ do
@@ -609,6 +636,7 @@ releaseRegs regs = do
   platform <- getPlatform
   assig <- getAssigR
   free <- getFreeRegsR
+
   let loop assig !free [] = do setAssigR assig; setFreeRegsR free; return ()
       loop assig !free (RegReal rr : rs) = loop assig (frReleaseReg platform rr free) rs
       loop assig !free (r:rs) =
@@ -662,8 +690,9 @@ saveClobberedTemps clobbered dying
 
         (instrs,assig') <- clobber assig [] to_spill
         setAssigR assig'
-        return instrs
-
+        return $ -- mkComment (text "<saveClobberedTemps>") ++
+                 instrs
+--              ++ mkComment (text "</saveClobberedTemps>")
    where
      -- See Note [UniqFM and the register allocator]
      clobber :: RegMap Loc -> [instr] -> [(Unique,RealReg)] -> RegM freeRegs ([instr], RegMap Loc)
@@ -700,7 +729,7 @@ saveClobberedTemps clobbered dying
 
                   let new_assign  = addToUFM_Directly assig temp (InBoth reg slot)
 
-                  clobber new_assign (spill : instrs) rest
+                  clobber new_assign (spill ++ instrs) rest
 
 
 
@@ -714,7 +743,17 @@ clobberRegs []
 clobberRegs clobbered
  = do   platform <- getPlatform
         freeregs <- getFreeRegsR
-        setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
+
+        let gpRegs  = frGetFreeRegs platform RcInteger freeregs :: [RealReg]
+            fltRegs = frGetFreeRegs platform RcFloat   freeregs :: [RealReg]
+            dblRegs = frGetFreeRegs platform RcDouble  freeregs :: [RealReg]
+
+        let extra_clobbered = [ r | r <- clobbered
+                                  , r `elem` (gpRegs ++ fltRegs ++ dblRegs) ]
+
+        setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs extra_clobbered
+
+        -- setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
 
         assig           <- getAssigR
         setAssigR $! clobber assig (nonDetUFMToList assig)
@@ -909,10 +948,7 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
                         | (temp_to_push_out, (my_reg :: RealReg)) : _
                                         <- candidates_inReg
                         = do
-                                (spill_insn, slot) <- spillR (RegReal my_reg) temp_to_push_out
-                                let spill_store  = (if reading then id else reverse)
-                                                        [ -- COMMENT (fsLit "spill alloc")
-                                                           spill_insn ]
+                                (spill_store, slot) <- spillR (RegReal my_reg) temp_to_push_out
 
                                 -- record that this temp was spilled
                                 recordSpill (SpillAlloc temp_to_push_out)
@@ -962,7 +998,7 @@ loadTemp vreg (ReadMem slot) hreg spills
  = do
         insn <- loadR (RegReal hreg) slot
         recordSpill (SpillLoad $ getUnique vreg)
-        return  $  {- COMMENT (fsLit "spill load") : -} insn : spills
+        return  $  {- mkComment (text "spill load") : -} insn ++ spills
 
 loadTemp _ _ _ spills =
    return spills
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs b/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs
new file mode 100644
index 0000000000..50299c4e74
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/AArch64.hs
@@ -0,0 +1,137 @@
+module GHC.CmmToAsm.Reg.Linear.AArch64 where
+
+import GHC.Prelude
+
+import GHC.CmmToAsm.AArch64.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Utils.Outputable
+import GHC.Utils.Panic
+import GHC.Platform
+
+import Data.Word
+
+import GHC.Stack
+-- AArch64 has 32 64bit general purpose register r0..r30, and zr/sp
+-- AArch64 has 32 128bit floating point registers v0..v31 as part of the NEON
+-- extension in Armv8-A.
+--
+-- Armv8-A is a fundamental change to the Arm architecture. It supports the
+-- 64-bit Execution state called “AArch64”, and a new 64-bit instruction set
+-- “A64”. To provide compatibility with the Armv7-A (32-bit architecture)
+-- instruction set, a 32-bit variant of Armv8-A “AArch32” is provided. Most of
+-- existing Armv7-A code can be run in the AArch32 execution state of Armv8-A.
+--
+-- these can be addresses as q/d/s/h/b 0..31, or v.f<size>[idx]
+-- where size is 64, 32, 16, 8, ... and the index i allows us
+-- to access the given part.
+--
+-- History of Arm Adv SIMD
+-- .---------------------------------------------------------------------------.
+-- | Armv6                  | Armv7-A                | Armv8-A AArch64         |
+-- | SIMD extension         | NEON                   | NEON                    |
+-- |===========================================================================|
+-- | - Operates on 32-bit   | - Separate reg. bank,  | - Separate reg. bank,   |
+-- |   GP ARM registers     |    32x64-bit NEON regs |   32x128-bit NEON regs  |
+-- | - 8-bit/16-bit integer | - 8/16/32/64-bit int   | - 8/16/32/64-bit int    |
+-- |                        | - Single percision fp  | - Single percision fp   |
+-- |                        |                        | - Double precision fp   |
+-- |                        |                        | - Single/Double fp are  |
+-- |                        |                        |   IEEE compliant        |
+-- | - 2x16-bit/4x8-bit ops | - Up to 16x8-bit ops   | - Up to 16x8-bit ops    |
+-- |   per instruction      |   per instruction      |   per instruction       |
+-- '---------------------------------------------------------------------------'
+
+data FreeRegs = FreeRegs !Word32 !Word32
+
+instance Show FreeRegs where
+  show (FreeRegs g f) = "FreeRegs: " ++ showBits g ++ "; " ++ showBits f
+
+instance Outputable FreeRegs where
+    ppr (FreeRegs g f) = text "   " <+> foldr (\i x -> pad_int i    <+> x) (text "") [0..31]
+                      $$ text "GPR" <+> foldr (\i x -> show_bit g i <+> x) (text "") [0..31]
+                      $$ text "FPR" <+> foldr (\i x -> show_bit f i <+> x) (text "") [0..31]
+      where pad_int i | i < 10 = char ' ' <> int i
+            pad_int i = int i
+            -- remember bit = 1 means it's available.
+            show_bit bits bit | testBit bits bit = text "  "
+            show_bit _    _ = text " x"
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0 0
+
+showBits :: Word32 -> String
+showBits w = map (\i -> if testBit w i then '1' else '0') [0..31]
+
+-- FR instance implementation (See Linear.FreeRegs)
+allocateReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs g f)
+    | r > 31 && testBit f (r - 32) = FreeRegs g (clearBit f (r - 32))
+    | r < 32 && testBit g r = FreeRegs (clearBit g r) f
+    | r > 31 = panic $ "Linear.AArch64.allocReg: double allocation of float reg v" ++ show (r - 32) ++ "; " ++ showBits f
+    | otherwise = pprPanic "Linear.AArch64.allocReg" $ text ("double allocation of gp reg x" ++ show r ++ "; " ++ showBits g)
+allocateReg _ _ = panic "Linear.AArch64.allocReg: bad reg"
+
+-- we start from 28 downwards... the logic is similar to the ppc logic.
+-- 31 is Stack Pointer
+-- 30 is Link Register
+-- 29 is Stack Frame (by convention)
+-- 19-28 are callee save
+-- the lower ones are all caller save
+
+-- For this reason someone decided to give aarch64 only 6 regs for
+-- STG:
+-- 19: Base
+-- 20: Sp
+-- 21: Hp
+-- 22-27: R1-R6
+-- 28: SpLim
+
+-- For LLVM code gen interop:
+-- See https://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20150119/253722.html
+-- and the current ghccc implementation here:
+-- https://github.com/llvm/llvm-project/blob/161ae1f39816edf667aaa190bce702a86879c7bd/llvm/lib/Target/AArch64/AArch64CallingConvention.td#L324-L363
+-- and https://gitlab.haskell.org/ghc/ghc/-/wikis/commentary/compiler/generated-code
+-- for the STG discussion.
+{- For reference the ghcc from the link above:
+let Entry = 1 in
+def CC_AArch64_GHC : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
+  CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
+
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
+]>;
+-}
+
+getFreeRegs :: RegClass -> FreeRegs -> [RealReg]
+getFreeRegs cls (FreeRegs g f)
+  | RcFloat   <- cls = [] -- For now we only support double and integer registers, floats will need to be promoted.
+  | RcDouble  <- cls = go 32 f 31
+  | RcInteger <- cls = go  0 g 18
+    where
+        go _   _ i | i < 0 = []
+        go off x i | testBit x i = RealRegSingle (off + i) : (go off x $! i - 1)
+                   | otherwise   = go off x $! i - 1
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+releaseReg :: HasCallStack => RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle r) (FreeRegs g f)
+  | r > 31 && testBit f (r - 32) = pprPanic "Linear.AArch64.releaseReg" (text  "can't release non-allocated reg v" <> int (r - 32))
+  | r < 32 && testBit g r = pprPanic "Linear.AArch64.releaseReg" (text "can't release non-allocated reg x" <> int r)
+  | r > 31 = FreeRegs g (setBit f (r - 32))
+  | otherwise = FreeRegs (setBit g r) f
+releaseReg _ _ = pprPanic "Linear.AArch64.releaseReg" (text "bad reg")
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
index d501718c4a..3ae0fa140d 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
@@ -25,14 +25,16 @@ import GHC.Platform
 --      getFreeRegs cls f = filter ( (==cls) . regClass . RealReg ) f
 --      allocateReg f r = filter (/= r) f
 
-import qualified GHC.CmmToAsm.Reg.Linear.PPC    as PPC
-import qualified GHC.CmmToAsm.Reg.Linear.SPARC  as SPARC
-import qualified GHC.CmmToAsm.Reg.Linear.X86    as X86
-import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.PPC     as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC   as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86     as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64  as X86_64
+import qualified GHC.CmmToAsm.Reg.Linear.AArch64 as AArch64
 
-import qualified GHC.CmmToAsm.PPC.Instr   as PPC.Instr
-import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr
-import qualified GHC.CmmToAsm.X86.Instr   as X86.Instr
+import qualified GHC.CmmToAsm.PPC.Instr     as PPC.Instr
+import qualified GHC.CmmToAsm.SPARC.Instr   as SPARC.Instr
+import qualified GHC.CmmToAsm.X86.Instr     as X86.Instr
+import qualified GHC.CmmToAsm.AArch64.Instr as AArch64.Instr
 
 class Show freeRegs => FR freeRegs where
     frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs
@@ -58,6 +60,12 @@ instance FR PPC.FreeRegs where
     frInitFreeRegs = PPC.initFreeRegs
     frReleaseReg   = \_ -> PPC.releaseReg
 
+instance FR AArch64.FreeRegs where
+    frAllocateReg = \_ -> AArch64.allocateReg
+    frGetFreeRegs = \_ -> AArch64.getFreeRegs
+    frInitFreeRegs = AArch64.initFreeRegs
+    frReleaseReg = \_ -> AArch64.releaseReg
+
 instance FR SPARC.FreeRegs where
     frAllocateReg  = SPARC.allocateReg
     frGetFreeRegs  = \_ -> SPARC.getFreeRegs
@@ -73,7 +81,7 @@ maxSpillSlots config = case platformArch (ncgPlatform config) of
    ArchSPARC     -> SPARC.Instr.maxSpillSlots config
    ArchSPARC64   -> panic "maxSpillSlots ArchSPARC64"
    ArchARM _ _ _ -> panic "maxSpillSlots ArchARM"
-   ArchAArch64   -> panic "maxSpillSlots ArchAArch64"
+   ArchAArch64   -> AArch64.Instr.maxSpillSlots config
    ArchPPC_64 _  -> PPC.Instr.maxSpillSlots config
    ArchAlpha     -> panic "maxSpillSlots ArchAlpha"
    ArchMipseb    -> panic "maxSpillSlots ArchMipseb"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
index d0330a4f6a..cbdf5d030b 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
@@ -26,6 +26,7 @@ import GHC.Cmm.BlockId
 import GHC.Cmm.Dataflow.Collections
 import GHC.Data.Graph.Directed
 import GHC.Utils.Panic
+import GHC.Utils.Monad (concatMapM)
 import GHC.Types.Unique
 import GHC.Types.Unique.FM
 import GHC.Types.Unique.Set
@@ -306,7 +307,7 @@ handleComponent
 --      go via a spill slot.
 --
 handleComponent delta _  (AcyclicSCC (DigraphNode vreg src dsts))
-        = mapM (makeMove delta vreg src) dsts
+        = concatMapM (makeMove delta vreg src) dsts
 
 
 -- Handle some cyclic moves.
@@ -340,7 +341,7 @@ handleComponent delta instr
 
         -- make sure to do all the reloads after all the spills,
         --      so we don't end up clobbering the source values.
-        return ([instrSpill] ++ concat remainingFixUps ++ [instrLoad])
+        return (instrSpill ++ concat remainingFixUps ++ instrLoad)
 
 handleComponent _ _ (CyclicSCC _)
  = panic "Register Allocator: handleComponent cyclic"
@@ -354,7 +355,7 @@ makeMove
     -> Unique   -- ^ unique of the vreg that we're moving.
     -> Loc      -- ^ source location.
     -> Loc      -- ^ destination location.
-    -> RegM freeRegs instr  -- ^ move instruction.
+    -> RegM freeRegs [instr]  -- ^ move instruction.
 
 makeMove delta vreg src dst
  = do config <- getConfig
@@ -363,7 +364,7 @@ makeMove delta vreg src dst
       case (src, dst) of
           (InReg s, InReg d) ->
               do recordSpill (SpillJoinRR vreg)
-                 return $ mkRegRegMoveInstr platform (RegReal s) (RegReal d)
+                 return $ [mkRegRegMoveInstr platform (RegReal s) (RegReal d)]
           (InMem s, InReg d) ->
               do recordSpill (SpillJoinRM vreg)
                  return $ mkLoadInstr config (RegReal d) delta s
@@ -377,4 +378,3 @@ makeMove delta vreg src dst
               panic ("makeMove " ++ show vreg ++ " (" ++ show src ++ ") ("
                   ++ show dst ++ ")"
                   ++ " we don't handle mem->mem moves.")
-
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
index 24a75121b8..ec1cd517ea 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
@@ -121,7 +121,7 @@ makeRAStats state
 
 
 spillR :: Instruction instr
-       => Reg -> Unique -> RegM freeRegs (instr, Int)
+       => Reg -> Unique -> RegM freeRegs ([instr], Int)
 
 spillR reg temp = mkRegM $ \s ->
   let (stack1,slot) = getStackSlotFor (ra_stack s) temp
@@ -131,7 +131,7 @@ spillR reg temp = mkRegM $ \s ->
 
 
 loadR :: Instruction instr
-      => Reg -> Int -> RegM freeRegs instr
+      => Reg -> Int -> RegM freeRegs [instr]
 
 loadR reg slot = mkRegM $ \s ->
   RA_Result s (mkLoadInstr (ra_config s) reg (ra_delta s) slot)
diff --git a/compiler/GHC/CmmToAsm/Reg/Liveness.hs b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
index 4d70533624..ad8190270f 100644
--- a/compiler/GHC/CmmToAsm/Reg/Liveness.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
@@ -174,6 +174,8 @@ instance Instruction instr => Instruction (InstrSR instr) where
 
         pprInstr platform i = ppr (fmap (pprInstr platform) i)
 
+        mkComment               = fmap Instr . mkComment
+
 
 -- | An instruction with liveness information.
 data LiveInstr instr
@@ -565,16 +567,20 @@ stripLiveBlock config (BasicBlock i lis)
  where  (instrs', _)
                 = runState (spillNat [] lis) 0
 
+        -- spillNat :: [instr] -> [LiveInstr instr] -> State Int [instr]
+        spillNat :: Instruction instr => [instr] -> [LiveInstr instr] -> State Int [instr]
         spillNat acc []
          =      return (reverse acc)
 
+        -- The SPILL/RELOAD cases do not appear to be exercised by our codegens
+        --
         spillNat acc (LiveInstr (SPILL reg slot) _ : instrs)
          = do   delta   <- get
-                spillNat (mkSpillInstr config reg delta slot : acc) instrs
+                spillNat (mkSpillInstr config reg delta slot ++ acc) instrs
 
         spillNat acc (LiveInstr (RELOAD slot reg) _ : instrs)
          = do   delta   <- get
-                spillNat (mkLoadInstr config reg delta slot : acc) instrs
+                spillNat (mkLoadInstr config reg delta slot ++ acc) instrs
 
         spillNat acc (LiveInstr (Instr instr) _ : instrs)
          | Just i <- takeDeltaInstr instr
diff --git a/compiler/GHC/CmmToAsm/Reg/Target.hs b/compiler/GHC/CmmToAsm/Reg/Target.hs
index 4611c3a8e8..22b22e21cc 100644
--- a/compiler/GHC/CmmToAsm/Reg/Target.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Target.hs
@@ -34,6 +34,8 @@ import qualified GHC.CmmToAsm.X86.Regs       as X86
 import qualified GHC.CmmToAsm.X86.RegInfo    as X86
 import qualified GHC.CmmToAsm.PPC.Regs       as PPC
 import qualified GHC.CmmToAsm.SPARC.Regs     as SPARC
+import qualified GHC.CmmToAsm.AArch64.Regs   as AArch64
+
 
 targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int
 targetVirtualRegSqueeze platform
@@ -46,7 +48,7 @@ targetVirtualRegSqueeze platform
       ArchSPARC64   -> panic "targetVirtualRegSqueeze ArchSPARC64"
       ArchPPC_64 _  -> PPC.virtualRegSqueeze
       ArchARM _ _ _ -> panic "targetVirtualRegSqueeze ArchARM"
-      ArchAArch64   -> panic "targetVirtualRegSqueeze ArchAArch64"
+      ArchAArch64   -> AArch64.virtualRegSqueeze
       ArchAlpha     -> panic "targetVirtualRegSqueeze ArchAlpha"
       ArchMipseb    -> panic "targetVirtualRegSqueeze ArchMipseb"
       ArchMipsel    -> panic "targetVirtualRegSqueeze ArchMipsel"
@@ -66,7 +68,7 @@ targetRealRegSqueeze platform
       ArchSPARC64   -> panic "targetRealRegSqueeze ArchSPARC64"
       ArchPPC_64 _  -> PPC.realRegSqueeze
       ArchARM _ _ _ -> panic "targetRealRegSqueeze ArchARM"
-      ArchAArch64   -> panic "targetRealRegSqueeze ArchAArch64"
+      ArchAArch64   -> AArch64.realRegSqueeze
       ArchAlpha     -> panic "targetRealRegSqueeze ArchAlpha"
       ArchMipseb    -> panic "targetRealRegSqueeze ArchMipseb"
       ArchMipsel    -> panic "targetRealRegSqueeze ArchMipsel"
@@ -85,7 +87,7 @@ targetClassOfRealReg platform
       ArchSPARC64   -> panic "targetClassOfRealReg ArchSPARC64"
       ArchPPC_64 _  -> PPC.classOfRealReg
       ArchARM _ _ _ -> panic "targetClassOfRealReg ArchARM"
-      ArchAArch64   -> panic "targetClassOfRealReg ArchAArch64"
+      ArchAArch64   -> AArch64.classOfRealReg
       ArchAlpha     -> panic "targetClassOfRealReg ArchAlpha"
       ArchMipseb    -> panic "targetClassOfRealReg ArchMipseb"
       ArchMipsel    -> panic "targetClassOfRealReg ArchMipsel"
@@ -104,7 +106,7 @@ targetMkVirtualReg platform
       ArchSPARC64   -> panic "targetMkVirtualReg ArchSPARC64"
       ArchPPC_64 _  -> PPC.mkVirtualReg
       ArchARM _ _ _ -> panic "targetMkVirtualReg ArchARM"
-      ArchAArch64   -> panic "targetMkVirtualReg ArchAArch64"
+      ArchAArch64   -> AArch64.mkVirtualReg
       ArchAlpha     -> panic "targetMkVirtualReg ArchAlpha"
       ArchMipseb    -> panic "targetMkVirtualReg ArchMipseb"
       ArchMipsel    -> panic "targetMkVirtualReg ArchMipsel"
@@ -123,7 +125,7 @@ targetRegDotColor platform
       ArchSPARC64   -> panic "targetRegDotColor ArchSPARC64"
       ArchPPC_64 _  -> PPC.regDotColor
       ArchARM _ _ _ -> panic "targetRegDotColor ArchARM"
-      ArchAArch64   -> panic "targetRegDotColor ArchAArch64"
+      ArchAArch64   -> AArch64.regDotColor
       ArchAlpha     -> panic "targetRegDotColor ArchAlpha"
       ArchMipseb    -> panic "targetRegDotColor ArchMipseb"
       ArchMipsel    -> panic "targetRegDotColor ArchMipsel"
diff --git a/compiler/GHC/CmmToAsm/SPARC.hs b/compiler/GHC/CmmToAsm/SPARC.hs
index 7d9a671932..cac72de6d3 100644
--- a/compiler/GHC/CmmToAsm/SPARC.hs
+++ b/compiler/GHC/CmmToAsm/SPARC.hs
@@ -69,7 +69,6 @@ instance Instruction SPARC.Instr where
    takeRegRegMoveInstr     = SPARC.takeRegRegMoveInstr
    mkJumpInstr             = SPARC.mkJumpInstr
    pprInstr                = SPARC.pprInstr
+   mkComment               = const []
    mkStackAllocInstr       = panic "no sparc_mkStackAllocInstr"
    mkStackDeallocInstr     = panic "no sparc_mkStackDeallocInstr"
-
-
diff --git a/compiler/GHC/CmmToAsm/SPARC/Instr.hs b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
index 3aeeb4d976..a5c9e46936 100644
--- a/compiler/GHC/CmmToAsm/SPARC/Instr.hs
+++ b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
@@ -362,7 +362,7 @@ mkSpillInstr
     -> Reg      -- ^ register to spill
     -> Int      -- ^ current stack delta
     -> Int      -- ^ spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkSpillInstr config reg _ slot
  = let  platform = ncgPlatform config
@@ -373,7 +373,7 @@ mkSpillInstr config reg _ slot
                         RcFloat   -> FF32
                         RcDouble  -> FF64
 
-    in ST fmt reg (fpRel (negate off_w))
+    in [ST fmt reg (fpRel (negate off_w))]
 
 
 -- | Make a spill reload instruction.
@@ -382,7 +382,7 @@ mkLoadInstr
     -> Reg      -- ^ register to load into
     -> Int      -- ^ current stack delta
     -> Int      -- ^ spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkLoadInstr config reg _ slot
   = let platform = ncgPlatform config
@@ -393,7 +393,7 @@ mkLoadInstr config reg _ slot
                         RcFloat   -> FF32
                         RcDouble  -> FF64
 
-        in LD fmt (fpRel (- off_w)) reg
+        in [LD fmt (fpRel (- off_w)) reg]
 
 
 --------------------------------------------------------------------------------
diff --git a/compiler/GHC/CmmToAsm/X86.hs b/compiler/GHC/CmmToAsm/X86.hs
index dbeeddc184..3d9780a99c 100644
--- a/compiler/GHC/CmmToAsm/X86.hs
+++ b/compiler/GHC/CmmToAsm/X86.hs
@@ -62,4 +62,4 @@ instance Instruction X86.Instr where
    mkStackAllocInstr       = X86.mkStackAllocInstr
    mkStackDeallocInstr     = X86.mkStackDeallocInstr
    pprInstr                = X86.pprInstr
-
+   mkComment               = const []
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
index e48d0922d8..9410537ed8 100644
--- a/compiler/GHC/CmmToAsm/X86/Instr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -672,15 +672,15 @@ mkSpillInstr
     -> Reg      -- register to spill
     -> Int      -- current stack delta
     -> Int      -- spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkSpillInstr config reg delta slot
   = let off     = spillSlotToOffset platform slot - delta
     in
     case targetClassOfReg platform reg of
-           RcInteger   -> MOV (archWordFormat is32Bit)
-                              (OpReg reg) (OpAddr (spRel platform off))
-           RcDouble    -> MOV FF64 (OpReg reg) (OpAddr (spRel platform off))
+           RcInteger   -> [MOV (archWordFormat is32Bit)
+                                   (OpReg reg) (OpAddr (spRel platform off))]
+           RcDouble    -> [MOV FF64 (OpReg reg) (OpAddr (spRel platform off))]
            _         -> panic "X86.mkSpillInstr: no match"
     where platform = ncgPlatform config
           is32Bit = target32Bit platform
@@ -691,16 +691,16 @@ mkLoadInstr
     -> Reg      -- register to load
     -> Int      -- current stack delta
     -> Int      -- spill slot to use
-    -> Instr
+    -> [Instr]
 
 mkLoadInstr config reg delta slot
   = let off     = spillSlotToOffset platform slot - delta
     in
         case targetClassOfReg platform reg of
-              RcInteger -> MOV (archWordFormat is32Bit)
-                               (OpAddr (spRel platform off)) (OpReg reg)
-              RcDouble  -> MOV FF64 (OpAddr (spRel platform off)) (OpReg reg)
-              _           -> panic "X86.mkLoadInstr"
+              RcInteger -> ([MOV (archWordFormat is32Bit)
+                                 (OpAddr (spRel platform off)) (OpReg reg)])
+              RcDouble  -> ([MOV FF64 (OpAddr (spRel platform off)) (OpReg reg)])
+              _         -> panic "X86.mkLoadInstr"
     where platform = ncgPlatform config
           is32Bit = target32Bit platform
 
diff --git a/compiler/GHC/Driver/Backend.hs b/compiler/GHC/Driver/Backend.hs
index 39789607d9..71cfdd9dec 100644
--- a/compiler/GHC/Driver/Backend.hs
+++ b/compiler/GHC/Driver/Backend.hs
@@ -104,6 +104,7 @@ platformNcgSupported platform = if
          ArchPPC       -> True
          ArchPPC_64 {} -> True
          ArchSPARC     -> True
+         ArchAArch64   -> True
          _             -> False
 
 -- | Will this backend produce an object file on the disk?
diff --git a/compiler/GHC/Linker/Static.hs b/compiler/GHC/Linker/Static.hs
index 4e2367f9e6..b0b8f1c541 100644
--- a/compiler/GHC/Linker/Static.hs
+++ b/compiler/GHC/Linker/Static.hs
@@ -222,7 +222,7 @@ linkBinary' staticLink logger tmpfs dflags unit_env o_files dep_units = do
                              not staticLink &&
                              (platformOS platform == OSDarwin) &&
                              case platformArch platform of
-                               ArchX86 -> True
+                               ArchX86     -> True
                                ArchX86_64  -> True
                                ArchARM {}  -> True
                                ArchAArch64 -> True
diff --git a/compiler/GHC/Platform.hs b/compiler/GHC/Platform.hs
index b3ab1b4020..70c8f9b658 100644
--- a/compiler/GHC/Platform.hs
+++ b/compiler/GHC/Platform.hs
@@ -40,6 +40,7 @@ module GHC.Platform
    , platformSOName
    , platformHsSOName
    , platformSOExt
+   , genericPlatform
    )
 where
 
@@ -83,6 +84,21 @@ platformConstants platform = case platform_constants platform of
   Nothing -> panic "Platform constants not available!"
   Just c  -> c
 
+genericPlatform :: Platform
+genericPlatform = Platform
+   { platformArchOS                  = ArchOS ArchX86_64 OSLinux
+   , platformWordSize                = PW8
+   , platformByteOrder               = LittleEndian
+   , platformUnregisterised          = False
+   , platformHasGnuNonexecStack      = False
+   , platformHasIdentDirective       = False
+   , platformHasSubsectionsViaSymbols= False
+   , platformIsCrossCompiling        = False
+   , platformLeadingUnderscore       = False
+   , platformTablesNextToCode        = True
+   , platform_constants               = Nothing
+   }
+
 data PlatformWordSize
   = PW4 -- ^ A 32-bit platform
   | PW8 -- ^ A 64-bit platform
@@ -223,7 +239,6 @@ data BmiVersion
    | BMI2
    deriving (Eq, Ord)
 
-
 -- | Platform-specific settings formerly hard-coded in Config.hs.
 --
 -- These should probably be all be triaged whether they can be computed from
diff --git a/compiler/GHC/Platform/Reg.hs b/compiler/GHC/Platform/Reg.hs
index c011a59eb0..5edd39df51 100644
--- a/compiler/GHC/Platform/Reg.hs
+++ b/compiler/GHC/Platform/Reg.hs
@@ -181,7 +181,7 @@ realRegsAlias rr1 rr2
 data Reg
         = RegVirtual !VirtualReg
         | RegReal    !RealReg
-        deriving (Eq, Ord)
+        deriving (Eq, Ord, Show)
 
 regSingle :: RegNo -> Reg
 regSingle regNo = RegReal (realRegSingle regNo)
diff --git a/compiler/GHC/StgToCmm/Foreign.hs b/compiler/GHC/StgToCmm/Foreign.hs
index 39f25a7b86..78a01d06d6 100644
--- a/compiler/GHC/StgToCmm/Foreign.hs
+++ b/compiler/GHC/StgToCmm/Foreign.hs
@@ -84,6 +84,7 @@ cgForeignCall (CCall (CCallSpec target cconv safety)) typ stg_args res_ty
             arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType platform arg)
                                      (platformWordSizeInBytes platform)
         ; cmm_args <- getFCallArgs stg_args typ
+        -- ; traceM $ show cmm_args
         ; (res_regs, res_hints) <- newUnboxedTupleRegs res_ty
         ; let ((call_args, arg_hints), cmm_target)
                 = case target of
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index 40cfde0d3a..5d8d1f9b22 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -203,6 +203,13 @@ Library
         GHC.Cmm.Switch.Implement
         GHC.CmmToAsm
         GHC.Cmm.LRegSet
+        GHC.CmmToAsm.AArch64
+        GHC.CmmToAsm.AArch64.CodeGen
+        GHC.CmmToAsm.AArch64.Cond
+        GHC.CmmToAsm.AArch64.Instr
+        GHC.CmmToAsm.AArch64.Ppr
+        GHC.CmmToAsm.AArch64.RegInfo
+        GHC.CmmToAsm.AArch64.Regs
         GHC.CmmToAsm.BlockLayout
         GHC.CmmToAsm.CFG
         GHC.CmmToAsm.CFG.Dominators
@@ -234,6 +241,7 @@ Library
         GHC.CmmToAsm.Reg.Graph.TrivColorable
         GHC.CmmToAsm.Reg.Graph.X86
         GHC.CmmToAsm.Reg.Linear
+        GHC.CmmToAsm.Reg.Linear.AArch64
         GHC.CmmToAsm.Reg.Linear.Base
         GHC.CmmToAsm.Reg.Linear.FreeRegs
         GHC.CmmToAsm.Reg.Linear.JoinToTargets
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 8c942662e6..0dfac62a3f 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -1,7 +1,8 @@
 
 import GHC.Cmm.Expr
 #if !(defined(MACHREGS_i386) || defined(MACHREGS_x86_64) \
-    || defined(MACHREGS_sparc) || defined(MACHREGS_powerpc))
+    || defined(MACHREGS_sparc) || defined(MACHREGS_powerpc) \
+    || defined(MACHREGS_aarch64))
 import GHC.Utils.Panic.Plain
 #endif
 import GHC.Platform.Reg
@@ -1016,6 +1017,98 @@ freeReg REG_HpLim = False
 # endif
 freeReg _ = True
 
+#elif defined(MACHREGS_aarch64)
+
+-- stack pointer / zero reg
+freeReg 31 = False
+-- link register
+freeReg 30 = False
+-- frame pointer
+freeReg 29 = False
+-- ip0 -- used for spill offset computations
+freeReg 16 = False
+
+# if defined(REG_Base)
+freeReg REG_Base  = False
+# endif
+# if defined(REG_Sp)
+freeReg REG_Sp    = False
+# endif
+# if defined(REG_SpLim)
+freeReg REG_SpLim = False
+# endif
+# if defined(REG_Hp)
+freeReg REG_Hp    = False
+# endif
+# if defined(REG_HpLim)
+freeReg REG_HpLim = False
+# endif
+
+# if defined(REG_R1)
+freeReg REG_R1    = False
+# endif
+# if defined(REG_R2)
+freeReg REG_R2    = False
+# endif
+# if defined(REG_R3)
+freeReg REG_R3    = False
+# endif
+# if defined(REG_R4)
+freeReg REG_R4    = False
+# endif
+# if defined(REG_R5)
+freeReg REG_R5    = False
+# endif
+# if defined(REG_R6)
+freeReg REG_R6    = False
+# endif
+# if defined(REG_R7)
+freeReg REG_R7    = False
+# endif
+# if defined(REG_R8)
+freeReg REG_R8    = False
+# endif
+
+# if defined(REG_F1)
+freeReg REG_F1    = False
+# endif
+# if defined(REG_F2)
+freeReg REG_F2    = False
+# endif
+# if defined(REG_F3)
+freeReg REG_F3    = False
+# endif
+# if defined(REG_F4)
+freeReg REG_F4    = False
+# endif
+# if defined(REG_F5)
+freeReg REG_F5    = False
+# endif
+# if defined(REG_F6)
+freeReg REG_F6    = False
+# endif
+
+# if defined(REG_D1)
+freeReg REG_D1    = False
+# endif
+# if defined(REG_D2)
+freeReg REG_D2    = False
+# endif
+# if defined(REG_D3)
+freeReg REG_D3    = False
+# endif
+# if defined(REG_D4)
+freeReg REG_D4    = False
+# endif
+# if defined(REG_D5)
+freeReg REG_D5    = False
+# endif
+# if defined(REG_D6)
+freeReg REG_D6    = False
+# endif
+
+freeReg _ = True
+
 #elif defined(MACHREGS_sparc)
 
 -- SPARC regs used by the OS / ABI
diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h
index 12f0e32f0f..0258811728 100644
--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -208,7 +208,6 @@ void flushExec(W_ len, AdjustorExecutable exec_addr);
 #if defined(darwin_HOST_OS)
 AdjustorWritable execToWritable(AdjustorExecutable exec);
 #endif
-
 #if RTS_LINKER_USE_MMAP
 AdjustorWritable allocateWrite(W_ bytes);
 void markExec(W_ bytes, AdjustorWritable writ);
diff --git a/includes/stg/MachRegs.h b/includes/stg/MachRegs.h
index 4b0991891e..d50969b66a 100644
--- a/includes/stg/MachRegs.h
+++ b/includes/stg/MachRegs.h
@@ -548,7 +548,7 @@ the stack. See Note [Overlapping global registers] for implications.
      r30    |  LR     | The Link Register
      r29    |  FP     | The Frame Pointer
    r19-r28  |         | Callee-saved registers
-     r18    |         | The Platform Register, if needed; 
+     r18    |         | The Platform Register, if needed;
             |         | or temporary register
      r17    |  IP1    | The second intra-procedure-call temporary register
      r16    |  IP0    | The first intra-procedure-call scratch register
diff --git a/mk/flavours/devel-cross-ncg.mk b/mk/flavours/devel-cross-ncg.mk
new file mode 100644
index 0000000000..0d5325dd25
--- /dev/null
+++ b/mk/flavours/devel-cross-ncg.mk
@@ -0,0 +1,18 @@
+SRC_HC_OPTS        = -O0 -H64m
+GhcStage1HcOpts    = -O2 -DDEBUG
+GhcStage2HcOpts    = -O0
+GhcLibHcOpts       = -O
+BUILD_PROF_LIBS    = NO
+SplitSections      = NO
+HADDOCK_DOCS       = NO
+BUILD_SPHINX_HTML  = NO
+BUILD_SPHINX_PDF   = NO
+BUILD_MAN          = NO
+WITH_TERMINFO      = NO
+
+BIGNUM_BACKEND       = native
+Stage1Only           = YES
+DYNAMIC_BY_DEFAULT   = NO
+DYNAMIC_GHC_PROGRAMS = NO
+
+libraries/Cabal_dist-install_HC_OPTS += -O0
diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c
index 2380ac7135..993af91528 100644
--- a/rts/RtsSymbols.c
+++ b/rts/RtsSymbols.c
@@ -1020,6 +1020,7 @@
       SymI_HasProto(registerInfoProvList)                               \
       SymI_HasProto(lookupIPE)                                          \
       RTS_USER_SIGNALS_SYMBOLS                                          \
+      RTS_LINKER_USE_MMAP_SYMBOLS                                       \
       RTS_INTCHAR_SYMBOLS
 
 
diff --git a/rts/linker/Elf.c b/rts/linker/Elf.c
index 84cb72bd6b..f6a1754257 100644
--- a/rts/linker/Elf.c
+++ b/rts/linker/Elf.c
@@ -774,9 +774,9 @@ ocGetNames_ELF ( ObjectCode* oc )
 
           void * mem = mmapAnonForLinker(size+stub_space);
 
-          if( mem == NULL ) {
-              barf("failed to mmap allocated memory to load section %d. "
-                   "errno = %d", i, errno);
+          if( mem == MAP_FAILED ) {
+            barf("failed to mmap allocated memory to load section %d. "
+                 "errno = %d", i, errno);
           }
 
           /* copy only the image part over; we don't want to copy data
diff --git a/testsuite/tests/ghci/linking/all.T b/testsuite/tests/ghci/linking/all.T
index 1739129a6a..5802c907c2 100644
--- a/testsuite/tests/ghci/linking/all.T
+++ b/testsuite/tests/ghci/linking/all.T
@@ -14,6 +14,9 @@ test('ghcilink003',
      [ unless(doing_ghci, skip),
        # libstdc++ is GCC-specific on FreeBSD. FreeBSD has libc++ though.
        when(opsys('freebsd'), fragile(17739)),
+       # from Big Sur onwards, we can't dlopen libstdc++.dylib
+       # anymore. Will produce:
+       #   dlopen(libstdc++.dylib, 5): image not found
        when(opsys('darwin'), fragile(16083))
      ], makefile_test, ['ghcilink003'])
 
@@ -34,6 +37,9 @@ test('ghcilink006',
      [ unless(doing_ghci, skip),
        # libstdc++ is GCC-specific on FreeBSD. FreeBSD has libc++ though.
        when(opsys('freebsd'), fragile(17739)),
+       # from Big Sur onwards, we can't dlopen libstdc++.dylib
+       # anymore. Will produce:
+       #   dlopen(libstdc++.dylib, 5): image not found
        when(opsys('darwin'), fragile(16083))
      ], makefile_test, ['ghcilink006'])
 
diff --git a/testsuite/tests/llvm/should_compile/all.T b/testsuite/tests/llvm/should_compile/all.T
index dca858dec8..4fb98dac7b 100644
--- a/testsuite/tests/llvm/should_compile/all.T
+++ b/testsuite/tests/llvm/should_compile/all.T
@@ -5,11 +5,19 @@ def f( name, opts ):
 
 setTestOpts(f)
 
+# Apples LLVM Toolchain knows about a `vortex` cpu (and possibly others), that
+# the stock LLVM toolchain doesn't know abotu and will warn about. Let's not
+# have test fail just because of processor name differences due to different
+# LLVM Toolchains.  GHC tries to pass what apple expects (on darwin), but can
+# be used with the stock LLVM toolchain as well.
+def ignore_llvm_and_vortex( msg ):
+     return re.sub(r".* is not a recognized processor for this target.*\n",r"",msg)
+
 # test('T5486', normal, compile, [''])
-test('T5681', normal, compile, [''])
+test('T5681', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, [''])
 test('T6158', [reqlib('vector'), reqlib('primitive')], compile, ['-package vector -package primitive'])
-test('T7571', cmm_src, compile, ['-no-hs-main'])
+test('T7571', [cmm_src, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, ['-no-hs-main'])
 test('T7575', unless(wordsize(32), skip), compile, [''])
-test('T8131b', normal, compile, [''])
-test('T11649', normal, compile, [''])
+test('T8131b', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, [''])
+test('T11649', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile, [''])
 test('T17920fail', cmm_src, compile_fail, ['-no-hs-main'])
diff --git a/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T b/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T
index 16a30e6f0f..22d7cb2b42 100644
--- a/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T
+++ b/testsuite/tests/llvm/should_run/subsections_via_symbols/all.T
@@ -3,9 +3,15 @@
 #
 # Please refer to https://gitlab.haskell.org/ghc/ghc/issues/5019
 # for the subsections_via_symbols.stderr
+def ignore_llvm_and_vortex( msg ):
+     return re.sub(r"You are using an unsupported version of LLVM!.*\n",r"",
+               re.sub(r"Currently only [^ ]* is supported. System LLVM version: .*\n", r"",
+                    re.sub(r"We will try though.*\n",r"",
+                         re.sub(r".* is not a recognized processor for this target.*\n",r"",msg))))
 
 test('subsections_via_symbols',
      [when(not opsys('darwin'), skip),
       only_ways(['optllvm', 'llvm', 'debugllvm']),
-      extra_files(['SubsectionsViaSymbols.hs'])],
+      extra_files(['SubsectionsViaSymbols.hs']),
+      normalise_errmsg_fun(ignore_llvm_and_vortex)],
      makefile_test, [])
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index 085e4b1f12..cd4a92dd7b 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -37,6 +37,7 @@ test('derefnull',
       # The output under OS X is too unstable to readily compare
       when(platform('i386-apple-darwin'), [ignore_stderr, exit_code(139)]),
       when(platform('x86_64-apple-darwin'), [ignore_stderr, exit_code(139)]),
+      when(platform('aarch64-apple-darwin'), [ignore_stderr, exit_code(139)]),
       when(opsys('mingw32'), [ignore_stderr, exit_code(11)]),
       when(opsys('mingw32'), [fragile(18548)]),
       # ThreadSanitizer changes the output