Diffstat (limited to 'compiler/nativeGen/X86')
-rw-r--r--  compiler/nativeGen/X86/CodeGen.hs  247
-rw-r--r--  compiler/nativeGen/X86/Cond.hs       2
-rw-r--r--  compiler/nativeGen/X86/Instr.hs    161
-rw-r--r--  compiler/nativeGen/X86/Ppr.hs      149
-rw-r--r--  compiler/nativeGen/X86/RegInfo.hs    2
-rw-r--r--  compiler/nativeGen/X86/Regs.hs      22
6 files changed, 488 insertions(+), 95 deletions(-)
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 341fa43dbc..a2e26bd68b 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -2,9 +2,7 @@
-- The default iteration limit is a bit too low for the definitions
-- in this module.
-#if __GLASGOW_HASKELL__ >= 800
{-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-}
-#endif
-----------------------------------------------------------------------------
--
@@ -32,6 +30,8 @@ where
#include "../includes/MachDeps.h"
-- NCG stuff:
+import GhcPrelude
+
import X86.Instr
import X86.Cond
import X86.Regs
@@ -65,7 +65,6 @@ import SrcLoc ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
import ForeignCall ( CCallConv(..) )
import OrdList
import Outputable
-import Unique
import FastString
import DynFlags
import Util
@@ -211,6 +210,9 @@ stmtToInstrs stmt = do
-> genCCall dflags is32Bit target result_regs args
CmmBranch id -> genBranch id
+
+  -- We try to arrange blocks such that the likely branch is the fallthrough
+  -- in CmmContFlowOpt. So we can assume the condition is likely false here.
CmmCondBranch arg true false _ -> do
b1 <- genCondJump true arg
b2 <- genBranch false
@@ -295,7 +297,7 @@ data Amode
= Amode AddrMode InstrBlock
{-
-Now, given a tree (the argument to an CmmLoad) that references memory,
+Now, given a tree (the argument to a CmmLoad) that references memory,
produce a suitable addressing mode.
A Rule of the Game (tm) for Amodes: use of the addr bit must
@@ -328,7 +330,7 @@ is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
- where blockLabel = mkAsmTempLabel (getUnique blockid)
+ where blockLabel = blockLbl blockid
-- -----------------------------------------------------------------------------
@@ -466,6 +468,20 @@ iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
r_dst_lo
)
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+ fn <- getAnyReg expr
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ code = fn r_dst_lo
+ return (
+ ChildCode64 (code `snocOL`
+ MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
+ CLTD II32 `snocOL`
+ MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
+ MOV II32 (OpReg edx) (OpReg r_dst_hi))
+ r_dst_lo
+ )
+
iselExpr64 expr
= pprPanic "iselExpr64(i386)" (ppr expr)
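For context on the new MO_SS_Conv W32 W64 case above: CLTD (cdq) sign-extends %eax into %edx:%eax, so the low register keeps the 32-bit value and the high register receives its sign bits. A minimal Haskell model of what the emitted mov/cltd/mov/mov sequence computes (the helper name is illustrative, not part of the patch):

```
import Data.Bits (shiftR)
import Data.Int (Int32, Int64)
import Data.Word (Word32)

-- Sketch of the (r_dst_lo, r_dst_hi) pair produced above: the high half
-- is all-zeros or all-ones depending on the sign of the input.
signExtend32To64 :: Int32 -> (Word32, Word32)
signExtend32To64 x =
  let wide = fromIntegral x :: Int64
  in (fromIntegral wide, fromIntegral (wide `shiftR` 32))
```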
@@ -503,6 +519,9 @@ getRegister' dflags is32Bit (CmmReg reg)
getRegister' dflags is32Bit (CmmRegOff r n)
= getRegister' dflags is32Bit $ mangleIndexTree dflags r n
+getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
+ = addAlignmentCheck align <$> getRegister' dflags is32Bit e
+
-- for 32-bit architectures, support some 64 -> 32 bit conversions:
-- TO_W_(x), TO_W_(x >> 32)
@@ -731,8 +750,10 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_F_Ne _ -> condFltReg is32Bit NE x y
MO_F_Gt _ -> condFltReg is32Bit GTT x y
MO_F_Ge _ -> condFltReg is32Bit GE x y
- MO_F_Lt _ -> condFltReg is32Bit LTT x y
- MO_F_Le _ -> condFltReg is32Bit LE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt _ -> condFltReg is32Bit GTT y x
+ MO_F_Le _ -> condFltReg is32Bit GE y x
MO_Eq _ -> condIntReg EQQ x y
MO_Ne _ -> condIntReg NE x y
@@ -1255,6 +1276,21 @@ isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
|| isSuitableFloatingPointLit lit
isOperand _ _ = False
+-- | Given a 'Register', produce a new 'Register' with an instruction block
+-- which will check the value for alignment. Used for @-falignment-sanitisation@.
+addAlignmentCheck :: Int -> Register -> Register
+addAlignmentCheck align reg =
+ case reg of
+ Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
+ Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt reg)
+ where
+ check :: Format -> Reg -> InstrBlock
+ check fmt reg =
+ ASSERT(not $ isFloatFormat fmt)
+ toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
+ , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
+ ]
+
memConstant :: Int -> CmmLit -> NatM Amode
memConstant align lit = do
lbl <- getNewLabelNat
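The check emitted by addAlignmentCheck above is a TEST against the mask align-1 followed by a jump to the bad-alignment label when any of those bits is set; this presumes align is a power of two. A rough model of the predicate (a sketch, not compiler code):

```
import Data.Bits ((.&.))
import Data.Word (Word64)

-- Model of `TEST (align-1), reg; JNE bad_alignment`: a pointer is
-- misaligned iff its low log2(align) bits are non-zero.
isMisaligned :: Int -> Word64 -> Bool
isMisaligned align ptr = ptr .&. (fromIntegral align - 1) /= 0
```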
@@ -1331,15 +1367,17 @@ getCondCode (CmmMachOp mop [x, y])
MO_F_Ne W32 -> condFltCode NE x y
MO_F_Gt W32 -> condFltCode GTT x y
MO_F_Ge W32 -> condFltCode GE x y
- MO_F_Lt W32 -> condFltCode LTT x y
- MO_F_Le W32 -> condFltCode LE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt W32 -> condFltCode GTT y x
+ MO_F_Le W32 -> condFltCode GE y x
MO_F_Eq W64 -> condFltCode EQQ x y
MO_F_Ne W64 -> condFltCode NE x y
MO_F_Gt W64 -> condFltCode GTT x y
MO_F_Ge W64 -> condFltCode GE x y
- MO_F_Lt W64 -> condFltCode LTT x y
- MO_F_Le W64 -> condFltCode LE x y
+ MO_F_Lt W64 -> condFltCode GTT y x
+ MO_F_Le W64 -> condFltCode GE y x
_ -> condIntCode (machOpToCond mop) x y
@@ -1639,11 +1677,19 @@ genCondJump' _ id bool = do
else do
lbl <- getBlockIdNat
- -- see comment with condFltReg
+ -- See Note [SSE Parity Checks]
let code = case cond of
NE -> or_unordered
GU -> plain_test
GEU -> plain_test
+ -- Use ASSERT so we don't break releases if
+ -- LTT/LE creep in somehow.
+ LTT ->
+ ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered
+ LE ->
+ ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered
_ -> and_ordered
plain_test = unitOL (
@@ -1855,6 +1901,72 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
+genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
+ args@[src, mask] = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform False (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+              -- The PDEP instruction doesn't take an r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
+ else
+ unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall dflags is32Bit target dest_regs args
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
+
+genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
+ args@[src, mask] = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform False (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+              -- The PEXT instruction doesn't take an r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
+ else
+ unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall dflags is32Bit target dest_regs args
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
+
genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
| is32Bit && width == W64 = do
-- Fallback to `hs_clz64` on i386
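For intuition about the two BMI2 instructions wrapped by the MO_Pdep and MO_Pext cases above: pdep scatters the low-order bits of the source to the positions of the set bits in the mask, and pext gathers the source bits selected by the mask into the low-order bits of the result. A reference model in plain Haskell (a sketch of the semantics, not GHC code):

```
import Data.Bits (finiteBitSize, setBit, testBit)
import Data.Word (Word64)

-- pdep: the i-th lowest bit of src lands at the position of the i-th
-- lowest set bit of mask; every other result bit is zero.
pdepModel :: Word64 -> Word64 -> Word64
pdepModel src mask = go 0 0 0
  where
    go acc i j
      | i >= finiteBitSize mask = acc
      | testBit mask i = go (if testBit src j then setBit acc i else acc)
                            (i + 1) (j + 1)
      | otherwise      = go acc (i + 1) j

-- pext: the src bit at the position of the i-th lowest set bit of mask
-- becomes the i-th lowest bit of the result.
pextModel :: Word64 -> Word64 -> Word64
pextModel src mask = go 0 0 0
  where
    go acc i j
      | i >= finiteBitSize mask = acc
      | testBit mask i = go (if testBit src i then setBit acc j else acc)
                            (i + 1) (j + 1)
      | otherwise      = go acc (i + 1) j
```

For example, pdepModel 0b101 0b11010 gives 0b10010, and pextModel 0b10010 0b11010 recovers 0b101.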
@@ -2129,6 +2241,8 @@ genCCall _ is32Bit target dest_regs args = do
ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
return code
_ -> panic "genCCall: Wrong number of arguments/results for add2"
+ (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
+ addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
(PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
(PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
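MO_AddWordC above yields the sum together with an unsigned carry-out, which the ADD_CC/CARRY pairing reads off the carry flag. Reference semantics (a sketch):

```
import Data.Word (Word64)

-- (result, carry): the carry is 1 exactly when the unsigned add wraps,
-- i.e. when the result is smaller than either operand.
addWordC :: Word64 -> Word64 -> (Word64, Word64)
addWordC x y = let r = x + y in (r, if r < x then 1 else 0)
```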
@@ -2645,6 +2759,10 @@ outOfLineCmmOp mop res args
MO_F32_Tanh -> fsLit "tanhf"
MO_F32_Pwr -> fsLit "powf"
+ MO_F32_Asinh -> fsLit "asinhf"
+ MO_F32_Acosh -> fsLit "acoshf"
+ MO_F32_Atanh -> fsLit "atanhf"
+
MO_F64_Sqrt -> fsLit "sqrt"
MO_F64_Fabs -> fsLit "fabs"
MO_F64_Sin -> fsLit "sin"
@@ -2662,15 +2780,23 @@ outOfLineCmmOp mop res args
MO_F64_Tanh -> fsLit "tanh"
MO_F64_Pwr -> fsLit "pow"
+ MO_F64_Asinh -> fsLit "asinh"
+ MO_F64_Acosh -> fsLit "acosh"
+ MO_F64_Atanh -> fsLit "atanh"
+
MO_Memcpy _ -> fsLit "memcpy"
MO_Memset _ -> fsLit "memset"
MO_Memmove _ -> fsLit "memmove"
+ MO_Memcmp _ -> fsLit "memcmp"
MO_PopCnt _ -> fsLit "popcnt"
MO_BSwap _ -> fsLit "bswap"
MO_Clz w -> fsLit $ clzLabel w
MO_Ctz _ -> unsupported
+ MO_Pdep w -> fsLit $ pdepLabel w
+ MO_Pext w -> fsLit $ pextLabel w
+
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"
MO_AtomicWrite _ -> fsLit "atomicwrite"
@@ -2684,6 +2810,7 @@ outOfLineCmmOp mop res args
MO_Add2 {} -> unsupported
MO_AddIntC {} -> unsupported
MO_SubIntC {} -> unsupported
+ MO_AddWordC {} -> unsupported
MO_SubWordC {} -> unsupported
MO_U_Mul2 {} -> unsupported
MO_WriteBarrier -> unsupported
@@ -2698,7 +2825,7 @@ outOfLineCmmOp mop res args
genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
genSwitch dflags expr targets
- | gopt Opt_PIC dflags
+ | positionIndependent dflags
= do
(reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset)
-- getNonClobberedReg because it needs to survive across t_code
@@ -2750,23 +2877,29 @@ genSwitch dflags expr targets
JMP_TBL op ids (Section ReadOnlyData lbl) lbl
]
return code
- where (offset, ids) = switchTargetsToTable targets
+ where
+ (offset, blockIds) = switchTargetsToTable targets
+ ids = map (fmap DestBlockId) blockIds
generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, CmmStatics) Instr)
generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
- = Just (createJumpTable dflags ids section lbl)
+ = let getBlockId (DestBlockId id) = id
+ getBlockId _ = panic "Non-Label target in Jump Table"
+ blockIds = map (fmap getBlockId) ids
+ in Just (createJumpTable dflags blockIds section lbl)
generateJumpTableForInstr _ _ = Nothing
createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
-> GenCmmDecl (Alignment, CmmStatics) h g
createJumpTable dflags ids section lbl
= let jumpTable
- | gopt Opt_PIC dflags =
- let jumpTableEntryRel Nothing
- = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+ | positionIndependent dflags =
+ let ww = wordWidth dflags
+ jumpTableEntryRel Nothing
+ = CmmStaticLit (CmmInt 0 ww)
jumpTableEntryRel (Just blockid)
- = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0)
- where blockLabel = mkAsmTempLabel (getUnique blockid)
+ = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
+ where blockLabel = blockLbl blockid
in map jumpTableEntryRel ids
| otherwise = map (jumpTableEntry dflags) ids
in CmmData section (1, Statics lbl jumpTable)
@@ -2797,6 +2930,59 @@ condIntReg cond x y = do
return (Any II32 code)
+-----------------------------------------------------------
+--- Note [SSE Parity Checks] ---
+-----------------------------------------------------------
+
+-- We have to worry about unordered operands (eg. comparisons
+-- against NaN). If the operands are unordered, the comparison
+-- sets the parity flag, carry flag and zero flag.
+-- All comparisons are supposed to return false for unordered
+-- operands except for !=, which returns true.
+--
+-- Optimisation: we don't have to test the parity flag if we
+-- know the test has already excluded the unordered case: eg >
+-- and >= test for a zero carry flag, which can only occur for
+-- ordered operands.
+--
+-- By reversing comparisons we can avoid testing the parity
+-- for < and <= as well. If either argument is a NaN we
+-- return false either way. If both arguments are valid then
+-- x <= y <-> y >= x holds, so it's safe to swap these.
+--
+-- We invert the condition inside getRegister' and getCondCode,
+-- which should cover all invertible cases.
+-- All other functions translating FP comparisons to assembly
+-- use these two to generate the comparison code.
+--
+-- As an example consider a simple check:
+--
+-- func :: Float -> Float -> Int
+-- func x y = if x < y then 1 else 0
+--
+-- In Cmm this gives the floating point comparison:
+--
+-- if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
+--
+-- We used to compile this to an assembly code block like this:
+-- _c2gh:
+-- ucomiss %xmm2,%xmm1
+-- jp _c2gf
+-- jb _c2gg
+-- jmp _c2gf
+--
+-- Here we have to introduce an explicit
+-- check for unordered results via jp (jump on parity).
+--
+-- We can avoid this by exchanging the arguments and inverting the direction
+-- of the comparison. This results in the sequence:
+--
+-- ucomiss %xmm1,%xmm2
+-- ja _c2g2
+-- jmp _c2g1
+--
+-- Removing the jump reduces the pressure on the branch prediction system
+-- and plays better with the uOP cache.
condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
@@ -2815,27 +3001,18 @@ condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
CondCode _ cond cond_code <- condFltCode cond x y
tmp1 <- getNewRegNat (archWordFormat is32Bit)
tmp2 <- getNewRegNat (archWordFormat is32Bit)
- let
- -- We have to worry about unordered operands (eg. comparisons
- -- against NaN). If the operands are unordered, the comparison
- -- sets the parity flag, carry flag and zero flag.
- -- All comparisons are supposed to return false for unordered
- -- operands except for !=, which returns true.
- --
- -- Optimisation: we don't have to test the parity flag if we
- -- know the test has already excluded the unordered case: eg >
- -- and >= test for a zero carry flag, which can only occur for
- -- ordered operands.
- --
- -- ToDo: by reversing comparisons we could avoid testing the
- -- parity flag in more cases.
-
+ let -- See Note [SSE Parity Checks]
code dst =
cond_code `appOL`
(case cond of
NE -> or_unordered dst
GU -> plain_test dst
GEU -> plain_test dst
+ -- Use ASSERT so we don't break releases if these creep in.
+ LTT -> ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered dst
+ LE -> ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered dst
_ -> and_ordered dst)
plain_test dst = toOL [
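A quick sanity check of the operand swap from Note [SSE Parity Checks]: for IEEE floats, x < y and y > x agree even in the presence of NaNs, since both are False whenever either operand is unordered. In plain Haskell (illustrative only):

```
-- Both equalities hold for every pair of Doubles, NaN included, which
-- is what lets MO_F_Lt x y compile to a GTT comparison of (y, x).
swapIsSafe :: Double -> Double -> Bool
swapIsSafe x y = (x < y) == (y > x) && (x <= y) == (y >= x)
```

For instance, swapIsSafe (0/0) 1, swapIsSafe 1 (0/0) and swapIsSafe 1 2 all evaluate to True.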
diff --git a/compiler/nativeGen/X86/Cond.hs b/compiler/nativeGen/X86/Cond.hs
index 586dabd8f4..35cbf943e1 100644
--- a/compiler/nativeGen/X86/Cond.hs
+++ b/compiler/nativeGen/X86/Cond.hs
@@ -8,6 +8,8 @@ module X86.Cond (
where
+import GhcPrelude
+
data Cond
= ALWAYS -- What's really used? ToDo
| EQQ
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 71f50e9d2a..c7000c9f4b 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -8,7 +8,7 @@
--
-----------------------------------------------------------------------------
-module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest,
+module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
getJumpDestBlockId, canShortcut, shortcutStatics,
shortcutJump, i386_insert_ffrees, allocMoreStack,
maxSpillSlots, archWordFormat)
@@ -17,6 +17,8 @@ where
#include "HsVersions.h"
#include "nativeGen/NCG.h"
+import GhcPrelude
+
import X86.Cond
import X86.Regs
import Instruction
@@ -320,7 +322,7 @@ data Instr
| JXX_GBL Cond Imm -- non-local version of JXX
-- Table jump
| JMP_TBL Operand -- Address to jump to
- [Maybe BlockId] -- Blocks in the jump table
+ [Maybe JumpDest] -- Targets of the jump table
Section -- Data section jump table should be put in
CLabel -- Label of jump table
| CALL (Either Imm Reg) [Reg]
@@ -343,6 +345,10 @@ data Instr
| BSF Format Operand Reg -- bit scan forward
| BSR Format Operand Reg -- bit scan reverse
+ -- bit manipulation instructions
+ | PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
+ | PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
+
-- prefetch
| PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
-- variant can be NTA, Lvl0, Lvl1, or Lvl2
@@ -462,6 +468,9 @@ x86_regUsageOfInstr platform instr
BSF _ src dst -> mkRU (use_R src []) [dst]
BSR _ src dst -> mkRU (use_R src []) [dst]
+ PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+ PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
LOCK i -> x86_regUsageOfInstr platform i
@@ -638,6 +647,8 @@ x86_patchRegsOfInstr instr env
CLTD _ -> instr
POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
+ PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
+ PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
BSR fmt src dst -> BSR fmt (patchOp src) (env dst)
@@ -693,7 +704,7 @@ x86_jumpDestsOfInstr
x86_jumpDestsOfInstr insn
= case insn of
JXX _ id -> [id]
- JMP_TBL _ ids _ _ -> [id | Just id <- ids]
+ JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids]
_ -> []
@@ -704,8 +715,12 @@ x86_patchJumpInstr insn patchF
= case insn of
JXX cc id -> JXX cc (patchF id)
JMP_TBL op ids section lbl
- -> JMP_TBL op (map (fmap patchF) ids) section lbl
+ -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl
_ -> insn
+ where
+ patchJumpDest f (DestBlockId id) = DestBlockId (f id)
+ patchJumpDest _ dest = dest
+
@@ -843,25 +858,104 @@ x86_mkJumpInstr
x86_mkJumpInstr id
= [JXX ALWAYS id]
+-- Note [Windows stack layout]
+-- | On most OSes the kernel will place a guard page after the current stack
+-- page. If you allocate more than a page's worth you may jump over this
+-- guard page. Not only is this a security issue, but on certain OSes such
+-- as Windows a new page won't be allocated if you don't hit the guard. This
+-- will cause a segfault or access fault.
+--
+-- This function determines whether the current allocation amount requires a probe.
+-- On Windows (for now) we emit a call to _chkstk for this. For other OSes
+-- this is not yet implemented.
+-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
+-- The Windows stack looks like this:
+--
+-- +-------------------+
+-- | SP |
+-- +-------------------+
+-- | |
+-- | GUARD PAGE |
+-- | |
+-- +-------------------+
+-- | |
+-- | |
+-- | UNMAPPED |
+-- | |
+-- | |
+-- +-------------------+
+--
+-- In essence each allocation larger than a page size needs to be chunked and
+-- a probe emitted after each page allocation. You have to hit the guard
+-- page so the kernel can map in the next page, otherwise you'll segfault.
+--
+needs_probe_call :: Platform -> Int -> Bool
+needs_probe_call platform amount
+ = case platformOS platform of
+ OSMinGW32 -> case platformArch platform of
+ ArchX86 -> amount > (4 * 1024)
+ ArchX86_64 -> amount > (8 * 1024)
+ _ -> False
+ _ -> False
x86_mkStackAllocInstr
:: Platform
-> Int
- -> Instr
+ -> [Instr]
x86_mkStackAllocInstr platform amount
- = case platformArch platform of
- ArchX86 -> SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
- ArchX86_64 -> SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
- _ -> panic "x86_mkStackAllocInstr"
+ = case platformOS platform of
+ OSMinGW32 ->
+ -- These will clobber AX but this should be ok because
+ --
+ -- 1. It is the first thing we do when entering the closure and AX is
+  --    a caller-saved register on Windows, both on x86_64 and x86.
+ --
+ -- 2. The closures are only entered via a call or longjmp in which case
+ -- there are no expectations for volatile registers.
+ --
+ -- 3. When the target is a local branch point it is re-targeted
+  --    after the dealloc, preserving #2. See Note [extra spill slots].
+ --
+ -- We emit a call because the stack probes are quite involved and
+ -- would bloat code size a lot. GHC doesn't really have an -Os.
+ -- __chkstk is guaranteed to leave all nonvolatile registers and AX
+ -- untouched. It's part of the standard prologue code for any Windows
+ -- function dropping the stack more than a page.
+ -- See Note [Windows stack layout]
+ case platformArch platform of
+ ArchX86 | needs_probe_call platform amount ->
+ [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
+ , CALL (Left $ strImmLit "___chkstk_ms") [eax]
+ , SUB II32 (OpReg eax) (OpReg esp)
+ ]
+ | otherwise ->
+ [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
+ , TEST II32 (OpReg esp) (OpReg esp)
+ ]
+ ArchX86_64 | needs_probe_call platform amount ->
+ [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
+ , CALL (Left $ strImmLit "__chkstk_ms") [rax]
+ , SUB II64 (OpReg rax) (OpReg rsp)
+ ]
+ | otherwise ->
+ [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
+ , TEST II64 (OpReg rsp) (OpReg rsp)
+ ]
+ _ -> panic "x86_mkStackAllocInstr"
+ _ ->
+ case platformArch platform of
+ ArchX86 -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
+ ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
+ _ -> panic "x86_mkStackAllocInstr"
x86_mkStackDeallocInstr
:: Platform
-> Int
- -> Instr
+ -> [Instr]
x86_mkStackDeallocInstr platform amount
= case platformArch platform of
- ArchX86 -> ADD II32 (OpImm (ImmInt amount)) (OpReg esp)
- ArchX86_64 -> ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)
+ ArchX86 -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)]
+ ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
_ -> panic "x86_mkStackDeallocInstr"
i386_insert_ffrees
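As a model of the chunking that the ___chkstk_ms call above performs: the requested amount is walked a page at a time so each guard page is hit in order, while the actual stack-pointer adjustment is left to the SUB that follows the CALL. A hedged sketch, assuming 4KB pages:

```
-- Offsets below the stack pointer that a chunked probe must touch for
-- an allocation of `amount` bytes; hitting each page in order lets the
-- kernel map in the next one (see Note [Windows stack layout]).
probeOffsets :: Int -> [Int]
probeOffsets amount = takeWhile (< amount) [page, 2 * page ..] ++ [amount]
  where page = 4 * 1024
```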
@@ -981,7 +1075,7 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
insert_stack_insns (BasicBlock id insns)
| Just new_blockid <- mapLookup id new_blockmap
- = [ BasicBlock id [alloc, JXX ALWAYS new_blockid]
+ = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
, BasicBlock new_blockid block' ]
| otherwise
= [ BasicBlock id block' ]
@@ -989,7 +1083,7 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
block' = foldr insert_dealloc [] insns
insert_dealloc insn r = case insn of
- JMP _ _ -> dealloc : insn : r
+ JMP _ _ -> dealloc ++ (insn : r)
JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
_other -> x86_patchJumpInstr insn retarget : r
where retarget b = fromMaybe b (mapLookup b new_blockmap)
@@ -998,7 +1092,6 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
-- in
return (CmmProc info lbl live (ListGraph new_code))
-
data JumpDest = DestBlockId BlockId | DestImm Imm
getJumpDestBlockId :: JumpDest -> Maybe BlockId
@@ -1015,14 +1108,24 @@ canShortcut _ = Nothing
-- The blockset helps avoid following cycles.
shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
- where shortcutJump' fn seen insn@(JXX cc id) =
- if setMember id seen then insn
- else case fn id of
- Nothing -> insn
- Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
- Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm)
- where seen' = setInsert id seen
- shortcutJump' _ _ other = other
+ where
+ shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
+ shortcutJump' fn seen insn@(JXX cc id) =
+ if setMember id seen then insn
+ else case fn id of
+ Nothing -> insn
+ Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
+ Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm)
+ where seen' = setInsert id seen
+ shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
+ let updateBlock (Just (DestBlockId bid)) =
+ case fn bid of
+ Nothing -> Just (DestBlockId bid )
+ Just dest -> Just dest
+ updateBlock dest = dest
+ blocks' = map updateBlock blocks
+ in JMP_TBL addr blocks' section tblId
+ shortcutJump' _ _ other = other
-- Here because it knows about JumpDest
shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, CmmStatics) -> (Alignment, CmmStatics)
@@ -1033,14 +1136,14 @@ shortcutStatics fn (align, Statics lbl statics)
shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
shortcutLabel fn lab
- | Just uq <- maybeAsmTemp lab = shortBlockId fn emptyUniqSet (mkBlockId uq)
- | otherwise = lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId
+ | otherwise = lab
shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
shortcutStatic fn (CmmStaticLit (CmmLabel lab))
= CmmStaticLit (CmmLabel (shortcutLabel fn lab))
-shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off))
- = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off)
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
-- slightly dodgy, we're ignoring the second label, but this
-- works with the way we use CmmLabelDiffOff for jump tables now.
shortcutStatic _ other_static
@@ -1054,8 +1157,8 @@ shortBlockId
shortBlockId fn seen blockid =
case (elementOfUniqSet uq seen, fn blockid) of
- (True, _) -> mkAsmTempLabel uq
- (_, Nothing) -> mkAsmTempLabel uq
+ (True, _) -> blockLbl blockid
+ (_, Nothing) -> blockLbl blockid
(_, Just (DestBlockId blockid')) -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
(_, Just (DestImm (ImmCLbl lbl))) -> lbl
(_, _other) -> panic "shortBlockId"
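The seen-set threading in shortcutJump' and shortBlockId above exists to stop redirect cycles (b1 -> b2 -> b1 would otherwise be chased forever). The same pattern in miniature (a sketch, not compiler code):

```
import qualified Data.Set as Set

-- Follow a chain of block redirects, stopping at the first repeat.
chase :: (Int -> Maybe Int) -> Int -> Int
chase redirect = go Set.empty
  where
    go seen b
      | Set.member b seen = b
      | otherwise = case redirect b of
          Nothing -> b
          Just b' -> go (Set.insert b seen) b'
```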
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index fce432a3dc..03d4fce794 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -23,6 +23,8 @@ where
#include "HsVersions.h"
#include "nativeGen/NCG.h"
+import GhcPrelude
+
import X86.Regs
import X86.Instr
import X86.Cond
@@ -37,8 +39,9 @@ import Hoopl.Label
import BasicTypes (Alignment)
import DynFlags
import Cmm hiding (topInfoTable)
+import BlockId
import CLabel
-import Unique ( pprUniqueAlways, Uniquable(..) )
+import Unique ( pprUniqueAlways )
import Platform
import FastString
import Outputable
@@ -70,12 +73,17 @@ import Data.Bits
-- .subsections_via_symbols and -dead_strip can be found at
-- <https://developer.apple.com/library/mac/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html#//apple_ref/doc/uid/TP30000823-TPXREF101>
+pprProcAlignment :: SDoc
+pprProcAlignment = sdocWithDynFlags $ \dflags ->
+ (maybe empty pprAlign . cmmProcAlignment $ dflags)
+
pprNatCmmDecl :: NatCmmDecl (Alignment, CmmStatics) Instr -> SDoc
pprNatCmmDecl (CmmData section dats) =
pprSectionAlign section $$ pprDatas dats
pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
sdocWithDynFlags $ \dflags ->
+ pprProcAlignment $$
case topInfoTable proc of
Nothing ->
case blocks of
@@ -83,6 +91,7 @@ pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
pprLabel lbl
blocks -> -- special case for code without info table:
pprSectionAlign (Section Text lbl) $$
+ pprProcAlignment $$
pprLabel lbl $$ -- blocks guaranteed not null, so label needed
vcat (map (pprBasicBlock top_info) blocks) $$
(if debugLevel dflags > 0
@@ -92,6 +101,7 @@ pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
Just (Statics info_lbl _) ->
sdocWithPlatform $ \platform ->
pprSectionAlign (Section Text info_lbl) $$
+ pprProcAlignment $$
(if platformHasSubsectionsViaSymbols platform
then ppr (mkDeadStripPreventer info_lbl) <> char ':'
else empty) $$
@@ -126,7 +136,7 @@ pprBasicBlock info_env (BasicBlock blockid instrs)
(if debugLevel dflags > 0
then ppr (mkAsmTempEndLabel asmLbl) <> char ':' else empty)
where
- asmLbl = mkAsmTempLabel (getUnique blockid)
+ asmLbl = blockLbl blockid
maybe_infotable = case mapLookup blockid info_env of
Nothing -> empty
Just (Statics info_lbl info) ->
@@ -160,35 +170,116 @@ pprGloblDecl lbl
| not (externallyVisibleCLabel lbl) = empty
| otherwise = text ".globl " <> ppr lbl
-pprTypeAndSizeDecl :: CLabel -> SDoc
-pprTypeAndSizeDecl lbl
+pprLabelType' :: DynFlags -> CLabel -> SDoc
+pprLabelType' dflags lbl =
+ if isCFunctionLabel lbl || functionOkInfoTable then
+ text "@function"
+ else
+ text "@object"
+ where
+ {-
+ NOTE: This is a bit hacky.
+
+ With the `tablesNextToCode` info tables look like this:
+  With `tablesNextToCode` enabled, info tables look like this:
+ <info table data>
+ label_info:
+ <info table code>
+ ```
+  So the info table label actually points exactly to the code and we can mark
+  the label as @function. (This is required to make perf and potentially other
+  tools work on Haskell binaries.)
+  This usually works well but it can cause issues with the linker.
+  The linker uses different algorithms for relocation depending on
+  the symbol type. For some reason, the linker will generate a JUMP_SLOT
+  relocation when a constructor info table is referenced from a data section.
+  This only happens with static constructor calls, so
+  we mark _con_info symbols as `@object` to avoid the issue with relocations.
+
+ @SimonMarlow hack explanation:
+ "The reasoning goes like this:
+
+ * The danger when we mark a symbol as `@function` is that the linker will
+ redirect it to point to the PLT and use a `JUMP_SLOT` relocation when
+ the symbol refers to something outside the current shared object.
+ A PLT / JUMP_SLOT reference only works for symbols that we jump to, not
+ for symbols representing data,, nor for info table symbol references which
+    for symbols representing data, nor for info table symbol references which
+ * GHC generates code that might refer to any info table symbol from the text
+ segment, but that's OK, because those will be explicit GOT references
+ generated by the code generator.
+ * When we refer to info tables from the data segment, it's either
+ * a FUN_STATIC/THUNK_STATIC local to this module
+ * a `con_info` that could be from anywhere
+
+ So, the only info table symbols that we might refer to from the data segment
+ of another shared object are `con_info` symbols, so those are the ones we
+ need to exclude from getting the @function treatment.
+ "
+
+  A good place to check for more is
+ https://ghc.haskell.org/trac/ghc/wiki/Commentary/PositionIndependentCode
+
+ Another possible hack is to create an extra local function symbol for
+  every code-like thing to give the needed information to the tools
+  without messing up the relocations. https://phabricator.haskell.org/D4730
+ -}
+ functionOkInfoTable = tablesNextToCode dflags &&
+ isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+
+pprTypeDecl :: CLabel -> SDoc
+pprTypeDecl lbl
= sdocWithPlatform $ \platform ->
if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
- then text ".type " <> ppr lbl <> ptext (sLit ", @object")
+ then
+ sdocWithDynFlags $ \df ->
+ text ".type " <> ppr lbl <> ptext (sLit ", ") <> pprLabelType' df lbl
else empty
pprLabel :: CLabel -> SDoc
pprLabel lbl = pprGloblDecl lbl
- $$ pprTypeAndSizeDecl lbl
+ $$ pprTypeDecl lbl
$$ (ppr lbl <> char ':')
+{-
+Note [Pretty print ASCII when AsmCodeGen]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Previously, when generating assembly code, we created an SDoc with
+`(ptext . sLit)` for every byte in a literal bytestring, then
+combined them using `hcat`.
+
+When handling literal bytestrings with millions of bytes,
+millions of SDocs would be created and combined, leading to
+high memory usage.
+
+Now we escape the given bytestring to a String directly and construct
+the SDoc only once. This improvement could dramatically decrease the
+memory allocation from 4.7GB to 1.3GB when embedding a 3MB literal
+string in source code. See Trac #14741 for profiling results.
+-}
pprASCII :: [Word8] -> SDoc
pprASCII str
- = hcat (map (do1 . fromIntegral) str)
+    -- Transform the given literal bytestring to an escaped string and construct
+ -- the literal SDoc directly.
+ -- See Trac #14741
+ -- and Note [Pretty print ASCII when AsmCodeGen]
+ = ptext $ sLit $ foldr (\w s -> (do1 . fromIntegral) w ++ s) "" str
where
- do1 :: Int -> SDoc
- do1 w | '\t' <- chr w = ptext (sLit "\\t")
- do1 w | '\n' <- chr w = ptext (sLit "\\n")
- do1 w | '"' <- chr w = ptext (sLit "\\\"")
- do1 w | '\\' <- chr w = ptext (sLit "\\\\")
- do1 w | isPrint (chr w) = char (chr w)
- do1 w | otherwise = char '\\' <> octal w
-
- octal :: Int -> SDoc
- octal w = int ((w `div` 64) `mod` 8)
- <> int ((w `div` 8) `mod` 8)
- <> int (w `mod` 8)
+ do1 :: Int -> String
+ do1 w | '\t' <- chr w = "\\t"
+ | '\n' <- chr w = "\\n"
+ | '"' <- chr w = "\\\""
+ | '\\' <- chr w = "\\\\"
+ | isPrint (chr w) = [chr w]
+ | otherwise = '\\' : octal w
+
+ octal :: Int -> String
+ octal w = [ chr (ord '0' + (w `div` 64) `mod` 8)
+ , chr (ord '0' + (w `div` 8) `mod` 8)
+ , chr (ord '0' + w `mod` 8)
+ ]
pprAlign :: Int -> SDoc
pprAlign bytes
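One subtlety in the new pprASCII above is the octal branch: every non-printable byte is emitted as a backslash followed by exactly three octal digits, a form GNU as accepts inside .ascii strings. The digit arithmetic, pulled out as a standalone sketch (hypothetical helper name):

```
import Data.Char (chr, ord)

-- Same arithmetic as `octal` above: 0 <= w < 256 yields exactly three
-- octal digits, e.g. 7 becomes "\007" and 255 becomes "\377".
octalEscape :: Int -> String
octalEscape w = '\\' : map digit [w `div` 64, w `div` 8, w]
  where digit d = chr (ord '0' + d `mod` 8)
```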
@@ -505,7 +596,7 @@ pprDataItem' dflags lit
--
case lit of
-- A relative relocation:
- CmmLabelDiffOff _ _ _ ->
+ CmmLabelDiffOff _ _ _ _ ->
[text "\t.long\t" <> pprImm imm,
text "\t.long\t0"]
_ ->
@@ -516,7 +607,7 @@ pprDataItem' dflags lit
asmComment :: SDoc -> SDoc
-asmComment c = ifPprDebug $ text "# " <> c
+asmComment c = whenPprDebug $ text "# " <> c
pprInstr :: Instr -> SDoc
@@ -645,6 +736,9 @@ pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst
pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst)
pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst)
+pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst
+pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst
+
pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src
pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src
pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src
@@ -702,7 +796,7 @@ pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op)
pprInstr (JXX cond blockid)
= pprCondInstr (sLit "j") cond (ppr lab)
- where lab = mkAsmTempLabel (getUnique blockid)
+ where lab = blockLbl blockid
pprInstr (JXX_GBL cond imm) = pprCondInstr (sLit "j") cond (pprImm imm)
@@ -1259,6 +1353,16 @@ pprFormatRegRegReg name format reg1 reg2 reg3
pprReg format reg3
]
+pprFormatOpOpReg :: LitString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprFormatOpOpReg name format op1 op2 reg3
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2,
+ comma,
+ pprReg format reg3
+ ]
pprFormatAddrReg :: LitString -> Format -> AddrMode -> Reg -> SDoc
pprFormatAddrReg name format op dst
@@ -1302,4 +1406,3 @@ pprFormatOpOpCoerce name format1 format2 op1 op2
pprCondInstr :: LitString -> Cond -> SDoc -> SDoc
pprCondInstr name cond arg
= hcat [ char '\t', ptext name, pprCond cond, space, arg]
-
diff --git a/compiler/nativeGen/X86/RegInfo.hs b/compiler/nativeGen/X86/RegInfo.hs
index 4dfe0350d4..226441b16f 100644
--- a/compiler/nativeGen/X86/RegInfo.hs
+++ b/compiler/nativeGen/X86/RegInfo.hs
@@ -9,6 +9,8 @@ where
#include "nativeGen/NCG.h"
#include "HsVersions.h"
+import GhcPrelude
+
import Format
import Reg
diff --git a/compiler/nativeGen/X86/Regs.hs b/compiler/nativeGen/X86/Regs.hs
index 4cb82ea224..97c3b984e2 100644
--- a/compiler/nativeGen/X86/Regs.hs
+++ b/compiler/nativeGen/X86/Regs.hs
@@ -48,6 +48,8 @@ where
#include "nativeGen/NCG.h"
#include "HsVersions.h"
+import GhcPrelude
+
import CodeGen.Platform
import Reg
import RegClass
@@ -58,8 +60,10 @@ import DynFlags
import Outputable
import Platform
+import qualified Data.Array as A
+
-- | regSqueeze_class reg
--- Calculuate the maximum number of register colors that could be
+-- Calculate the maximum number of register colors that could be
-- denied to a node of this class due to having this reg
-- as a neighbour.
--
@@ -142,7 +146,7 @@ litToImm (CmmFloat f W32) = ImmFloat f
litToImm (CmmFloat f W64) = ImmDouble f
litToImm (CmmLabel l) = ImmCLbl l
litToImm (CmmLabelOff l off) = ImmIndex l off
-litToImm (CmmLabelDiffOff l1 l2 off)
+litToImm (CmmLabelDiffOff l1 l2 off _)
= ImmConstantSum
(ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
(ImmInt off)
@@ -234,7 +238,6 @@ xmmregnos platform = [firstxmm .. lastxmm platform]
floatregnos :: Platform -> [RegNo]
floatregnos platform = fakeregnos ++ xmmregnos platform
-
-- argRegs is the set of regs which are read for an n-argument call to C.
-- For archs which pass all args on the stack (x86), is empty.
-- Sparc passes up to the first 6 args in regs.
@@ -267,13 +270,13 @@ showReg platform n
| n >= firstxmm = "%xmm" ++ show (n-firstxmm)
| n >= firstfake = "%fake" ++ show (n-firstfake)
| n >= 8 = "%r" ++ show n
- | otherwise = regNames platform !! n
+ | otherwise = regNames platform A.! n
-regNames :: Platform -> [String]
+regNames :: Platform -> A.Array Int String
regNames platform
= if target32Bit platform
- then ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
- else ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
+ then A.listArray (0,8) ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
+ else A.listArray (0,8) ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
@@ -404,7 +407,10 @@ callClobberedRegs platform
| target32Bit platform = [eax,ecx,edx] ++ map regSingle (floatregnos platform)
| platformOS platform == OSMinGW32
= [rax,rcx,rdx,r8,r9,r10,r11]
- ++ map regSingle (floatregnos platform)
+      -- Only xmm0-5 are caller-saved registers on 64-bit Windows.
+ -- ( https://docs.microsoft.com/en-us/cpp/build/register-usage )
+ -- For details check the Win64 ABI.
+ ++ map regSingle fakeregnos ++ map xmm [0 .. 5]
| otherwise
-- all xmm regs are caller-saves
-- caller-saves registers