Diffstat (limited to 'compiler/nativeGen/X86')
 compiler/nativeGen/X86/CodeGen.hs | 247
 compiler/nativeGen/X86/Cond.hs    |   2
 compiler/nativeGen/X86/Instr.hs   | 161
 compiler/nativeGen/X86/Ppr.hs     | 149
 compiler/nativeGen/X86/RegInfo.hs |   2
 compiler/nativeGen/X86/Regs.hs    |  22
6 files changed, 488 insertions, 95 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 341fa43dbc..a2e26bd68b 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -2,9 +2,7 @@
 -- The default iteration limit is a bit too low for the definitions
 -- in this module.
-#if __GLASGOW_HASKELL__ >= 800
 {-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-}
-#endif
 
 -----------------------------------------------------------------------------
 --
@@ -32,6 +30,8 @@ where
 #include "../includes/MachDeps.h"
 
 -- NCG stuff:
+import GhcPrelude
+
 import X86.Instr
 import X86.Cond
 import X86.Regs
@@ -65,7 +65,6 @@ import SrcLoc           ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
 import ForeignCall      ( CCallConv(..) )
 import OrdList
 import Outputable
-import Unique
 import FastString
 import DynFlags
 import Util
@@ -211,6 +210,9 @@ stmtToInstrs stmt = do
       -> genCCall dflags is32Bit target result_regs args
 
     CmmBranch id          -> genBranch id
+
+    -- We try to arrange blocks such that the likely branch is the fallthrough
+    -- in CmmContFlowOpt. So we can assume the condition is likely false here.
     CmmCondBranch arg true false _ -> do
       b1 <- genCondJump true arg
       b2 <- genBranch false
@@ -295,7 +297,7 @@ data Amode
        = Amode AddrMode InstrBlock
 
 {-
-Now, given a tree (the argument to an CmmLoad) that references memory,
+Now, given a tree (the argument to a CmmLoad) that references memory,
 produce a suitable addressing mode.
 
 A Rule of the Game (tm) for Amodes: use of the addr bit must
@@ -328,7 +330,7 @@ is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
 jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
 jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
 jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
-    where blockLabel = mkAsmTempLabel (getUnique blockid)
+    where blockLabel = blockLbl blockid
 
 
 -- -----------------------------------------------------------------------------
@@ -466,6 +468,20 @@ iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
                      r_dst_lo
             )
 
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+     fn <- getAnyReg expr
+     r_dst_lo <- getNewRegNat II32
+     let r_dst_hi = getHiVRegFromLo r_dst_lo
+         code = fn r_dst_lo
+     return (
+               ChildCode64 (code `snocOL`
+                            MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
+                            CLTD II32 `snocOL`
+                            MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
+                            MOV II32 (OpReg edx) (OpReg r_dst_hi))
+                           r_dst_lo
+            )
+
 iselExpr64 expr
    = pprPanic "iselExpr64(i386)" (ppr expr)
 
@@ -503,6 +519,9 @@ getRegister' dflags is32Bit (CmmReg reg)
 getRegister' dflags is32Bit (CmmRegOff r n)
   = getRegister' dflags is32Bit $ mangleIndexTree dflags r n
 
+getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
+  = addAlignmentCheck align <$> getRegister' dflags is32Bit e
+
 -- for 32-bit architectures, support some 64 -> 32 bit conversions:
 -- TO_W_(x), TO_W_(x >> 32)
 
@@ -731,8 +750,10 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_F_Ne _ -> condFltReg is32Bit NE  x y
       MO_F_Gt _ -> condFltReg is32Bit GTT x y
       MO_F_Ge _ -> condFltReg is32Bit GE  x y
-      MO_F_Lt _ -> condFltReg is32Bit LTT x y
-      MO_F_Le _ -> condFltReg is32Bit LE  x y
+      -- Invert comparison condition and swap operands
+      -- See Note [SSE Parity Checks]
+      MO_F_Lt _ -> condFltReg is32Bit GTT y x
+      MO_F_Le _ -> condFltReg is32Bit GE  y x
 
       MO_Eq _   -> condIntReg EQQ x y
       MO_Ne _   -> condIntReg NE  x y
@@ -1255,6 +1276,21 @@ isOperand is32Bit (CmmLit lit)  = is32BitLit is32Bit lit
                           || isSuitableFloatingPointLit lit
 isOperand _ _            = False
 
+-- | Given a 'Register', produce a new 'Register' with an instruction block
+-- which will check the value for alignment. Used for @-falignment-sanitisation@.
+addAlignmentCheck :: Int -> Register -> Register
+addAlignmentCheck align reg =
+    case reg of
+      Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
+      Any fmt f          -> Any fmt (\reg -> f reg `appOL` check fmt reg)
+  where
+    check :: Format -> Reg -> InstrBlock
+    check fmt reg =
+        ASSERT(not $ isFloatFormat fmt)
+        toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
+             , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
+             ]
+
 memConstant :: Int -> CmmLit -> NatM Amode
 memConstant align lit = do
   lbl <- getNewLabelNat
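For reference, the test addAlignmentCheck emits can be modelled in plain Haskell; a minimal sketch (not part of the patch, and assuming align is a power of two, which the TEST-against-mask trick requires):

    import Data.Bits ((.&.))
    import Data.Word (Word64)

    -- TEST sets ZF iff (v .&. (align-1)) == 0, i.e. iff v is a multiple
    -- of align; the JNE to the bad-alignment label fires when it is not.
    isAligned :: Word64 -> Word64 -> Bool
    isAligned align v = v .&. (align - 1) == 0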
@@ -1331,15 +1367,17 @@ getCondCode (CmmMachOp mop [x, y])
       MO_F_Ne W32 -> condFltCode NE  x y
       MO_F_Gt W32 -> condFltCode GTT x y
       MO_F_Ge W32 -> condFltCode GE  x y
-      MO_F_Lt W32 -> condFltCode LTT x y
-      MO_F_Le W32 -> condFltCode LE  x y
+      -- Invert comparison condition and swap operands
+      -- See Note [SSE Parity Checks]
+      MO_F_Lt W32 -> condFltCode GTT y x
+      MO_F_Le W32 -> condFltCode GE  y x
 
       MO_F_Eq W64 -> condFltCode EQQ x y
       MO_F_Ne W64 -> condFltCode NE  x y
       MO_F_Gt W64 -> condFltCode GTT x y
       MO_F_Ge W64 -> condFltCode GE  x y
-      MO_F_Lt W64 -> condFltCode LTT x y
-      MO_F_Le W64 -> condFltCode LE  x y
+      MO_F_Lt W64 -> condFltCode GTT y x
+      MO_F_Le W64 -> condFltCode GE  y x
 
       _ -> condIntCode (machOpToCond mop) x y
@@ -1639,11 +1677,19 @@ genCondJump' _ id bool = do
     else do
       lbl <- getBlockIdNat
 
-      -- see comment with condFltReg
+      -- See Note [SSE Parity Checks]
       let code = case cond of
                    NE  -> or_unordered
                    GU  -> plain_test
                    GEU -> plain_test
+                   -- Use ASSERT so we don't break releases if
+                   -- LTT/LE creep in somehow.
+                   LTT ->
+                     ASSERT2(False, ppr "Should have been turned into >")
+                     and_ordered
+                   LE  ->
+                     ASSERT2(False, ppr "Should have been turned into >=")
+                     and_ordered
                    _   -> and_ordered
 
           plain_test = unitOL (
@@ -1855,6 +1901,72 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
     format = intFormat width
     lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
 
+genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
+         args@[src, mask] = do
+    let platform = targetPlatform dflags
+    if isBmi2Enabled dflags
+      then do code_src  <- getAnyReg src
+              code_mask <- getAnyReg mask
+              src_r     <- getNewRegNat format
+              mask_r    <- getNewRegNat format
+              let dst_r = getRegisterReg platform False (CmmLocal dst)
+              return $ code_src src_r `appOL` code_mask mask_r `appOL`
+                  (if width == W8 then
+                       -- The PDEP instruction doesn't take a r/m8
+                       unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+                       unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+                       unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
+                   else
+                       unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+                  (if width == W8 || width == W16 then
+                       -- We used a 16-bit destination register above,
+                       -- so zero-extend
+                       unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+                   else nilOL)
+      else do
+        targetExpr <- cmmMakeDynamicReference dflags
+                      CallReference lbl
+        let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+                                               [NoHint] [NoHint]
+                                               CmmMayReturn)
+        genCCall dflags is32Bit target dest_regs args
+  where
+    format = intFormat width
+    lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
+
+genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
+         args@[src, mask] = do
+    let platform = targetPlatform dflags
+    if isBmi2Enabled dflags
+      then do code_src  <- getAnyReg src
+              code_mask <- getAnyReg mask
+              src_r     <- getNewRegNat format
+              mask_r    <- getNewRegNat format
+              let dst_r = getRegisterReg platform False (CmmLocal dst)
+              return $ code_src src_r `appOL` code_mask mask_r `appOL`
+                  (if width == W8 then
+                       -- The PEXT instruction doesn't take a r/m8
+                       unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+                       unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+                       unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
+                   else
+                       unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+                  (if width == W8 || width == W16 then
+                       -- We used a 16-bit destination register above,
+                       -- so zero-extend
+                       unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+                   else nilOL)
+      else do
+        targetExpr <- cmmMakeDynamicReference dflags
+                      CallReference lbl
+        let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+                                               [NoHint] [NoHint]
+                                               CmmMayReturn)
+        genCCall dflags is32Bit target dest_regs args
+  where
+    format = intFormat width
+    lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
+
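The BMI2 semantics the MO_Pdep/MO_Pext cases above rely on can be described by a small Haskell model (a sketch for orientation only, not the fallback implementation the patch calls):

    import Data.Bits ((.&.), (.|.), shiftL, shiftR)
    import Data.Word (Word64)

    -- pdep scatters the low bits of src into the set-bit positions of mask;
    -- pext gathers the bits of src selected by mask into the low bits.
    pdep, pext :: Word64 -> Word64 -> Word64
    pdep s m
      | m == 0    = 0
      | otherwise =
          let bit = m .&. negate m              -- lowest set bit of mask
          in (if s .&. 1 /= 0 then bit else 0)
             .|. pdep (s `shiftR` 1) (m .&. (m - 1))
    pext s = go 0
      where
        go _ 0 = 0
        go i m =
          let bit = m .&. negate m
          in (if s .&. bit /= 0 then 1 `shiftL` i else 0)
             .|. go (i + 1) (m .&. (m - 1))

For example, pdep 3 10 == 10 (the two low source bits land in bit positions 1 and 3) and pext 10 10 == 3.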
 genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
   | is32Bit && width == W64 = do
     -- Fallback to `hs_clz64` on i386
@@ -2129,6 +2241,8 @@ genCCall _ is32Bit target dest_regs args = do
                 ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
             return code
         _ -> panic "genCCall: Wrong number of arguments/results for add2"
+    (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
+        addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
     (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
        addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
     (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
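MO_AddWordC exposes unsigned add-with-carry; the semantics the ADD_CC/CARRY pair computes amount to the following sketch (illustrative only, not the generated code):

    import Data.Word (Word64)

    -- The carry-out of an unsigned add is 1 exactly when the result
    -- wraps around, i.e. when the sum is smaller than an operand.
    addWordC :: Word64 -> Word64 -> (Word64, Word64)
    addWordC x y = (s, if s < x then 1 else 0)
      where s = x + y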
@@ -2645,6 +2759,10 @@ outOfLineCmmOp mop res args
         MO_F32_Tanh  -> fsLit "tanhf"
         MO_F32_Pwr   -> fsLit "powf"
 
+        MO_F32_Asinh -> fsLit "asinhf"
+        MO_F32_Acosh -> fsLit "acoshf"
+        MO_F32_Atanh -> fsLit "atanhf"
+
         MO_F64_Sqrt  -> fsLit "sqrt"
         MO_F64_Fabs  -> fsLit "fabs"
         MO_F64_Sin   -> fsLit "sin"
@@ -2662,15 +2780,23 @@ outOfLineCmmOp mop res args
         MO_F64_Tanh  -> fsLit "tanh"
         MO_F64_Pwr   -> fsLit "pow"
 
+        MO_F64_Asinh -> fsLit "asinh"
+        MO_F64_Acosh -> fsLit "acosh"
+        MO_F64_Atanh -> fsLit "atanh"
+
         MO_Memcpy _  -> fsLit "memcpy"
         MO_Memset _  -> fsLit "memset"
         MO_Memmove _ -> fsLit "memmove"
+        MO_Memcmp _  -> fsLit "memcmp"
 
         MO_PopCnt _  -> fsLit "popcnt"
         MO_BSwap _   -> fsLit "bswap"
         MO_Clz w     -> fsLit $ clzLabel w
         MO_Ctz _     -> unsupported
 
+        MO_Pdep w    -> fsLit $ pdepLabel w
+        MO_Pext w    -> fsLit $ pextLabel w
+
         MO_AtomicRMW _ _ -> fsLit "atomicrmw"
         MO_AtomicRead _  -> fsLit "atomicread"
         MO_AtomicWrite _ -> fsLit "atomicwrite"
@@ -2684,6 +2810,7 @@ outOfLineCmmOp mop res args
         MO_Add2 {}       -> unsupported
         MO_AddIntC {}    -> unsupported
         MO_SubIntC {}    -> unsupported
+        MO_AddWordC {}   -> unsupported
         MO_SubWordC {}   -> unsupported
         MO_U_Mul2 {}     -> unsupported
         MO_WriteBarrier  -> unsupported
@@ -2698,7 +2825,7 @@ outOfLineCmmOp mop res args
 
 genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
 genSwitch dflags expr targets
-  | gopt Opt_PIC dflags
+  | positionIndependent dflags
   = do
         (reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset)
            -- getNonClobberedReg because it needs to survive across t_code
@@ -2750,23 +2877,29 @@ genSwitch dflags expr targets
             JMP_TBL op ids (Section ReadOnlyData lbl) lbl
            ]
         return code
-  where (offset, ids) = switchTargetsToTable targets
+  where
+    (offset, blockIds) = switchTargetsToTable targets
+    ids = map (fmap DestBlockId) blockIds
 
 generateJumpTableForInstr :: DynFlags -> Instr
                           -> Maybe (NatCmmDecl (Alignment, CmmStatics) Instr)
 generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
-    = Just (createJumpTable dflags ids section lbl)
+    = let getBlockId (DestBlockId id) = id
+          getBlockId _ = panic "Non-Label target in Jump Table"
+          blockIds = map (fmap getBlockId) ids
+      in Just (createJumpTable dflags blockIds section lbl)
 generateJumpTableForInstr _ _ = Nothing
 
 createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
                 -> GenCmmDecl (Alignment, CmmStatics) h g
 createJumpTable dflags ids section lbl
     = let jumpTable
-            | gopt Opt_PIC dflags =
-              let jumpTableEntryRel Nothing
-                      = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+            | positionIndependent dflags =
+              let ww = wordWidth dflags
+                  jumpTableEntryRel Nothing
+                      = CmmStaticLit (CmmInt 0 ww)
                   jumpTableEntryRel (Just blockid)
-                      = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0)
-                      where blockLabel = mkAsmTempLabel (getUnique blockid)
+                      = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
+                      where blockLabel = blockLbl blockid
               in map jumpTableEntryRel ids
             | otherwise = map (jumpTableEntry dflags) ids
       in CmmData section (1, Statics lbl jumpTable)
@@ -2797,6 +2930,59 @@ condIntReg cond x y = do
     return (Any II32 code)
 
 
+-----------------------------------------------------------
+---  Note [SSE Parity Checks]                            ---
+-----------------------------------------------------------
+
+-- We have to worry about unordered operands (eg. comparisons
+-- against NaN). If the operands are unordered, the comparison
+-- sets the parity flag, carry flag and zero flag.
+-- All comparisons are supposed to return false for unordered
+-- operands except for !=, which returns true.
+--
+-- Optimisation: we don't have to test the parity flag if we
+-- know the test has already excluded the unordered case: eg >
+-- and >= test for a zero carry flag, which can only occur for
+-- ordered operands.
+--
+-- By reversing comparisons we can avoid testing the parity
+-- flag for < and <= as well. If either argument is a NaN we
+-- return false either way. If both arguments are valid then
+-- x <= y <-> y >= x holds. So it's safe to swap these.
+--
+-- We invert the condition inside getRegister' and getCondCode,
+-- which should cover all invertible cases.
+-- All other functions translating FP comparisons to assembly
+-- use these two to generate the comparison code.
+--
+-- As an example consider a simple check:
+--
+-- func :: Float -> Float -> Int
+-- func x y = if x < y then 1 else 0
+--
+-- which in Cmm gives the floating point comparison
+--
+--  if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
+--
+-- We used to compile this to an assembly code block like this:
+-- _c2gh:
+--  ucomiss %xmm2,%xmm1
+--  jp _c2gf
+--  jb _c2gg
+--  jmp _c2gf
+--
+-- where we had to introduce an explicit check for unordered
+-- results (using jp, the jump-on-parity instruction).
+--
+-- We can avoid this by exchanging the arguments and inverting the direction
+-- of the comparison. This results in the sequence
+--
+--  ucomiss %xmm1,%xmm2
+--  ja _c2g2
+--  jmp _c2g1
+--
+-- Removing the jump reduces the pressure on the branch prediction system
+-- and plays better with the uOP cache.
 
 condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
 condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
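The note's claim that the operand swap is NaN-safe is easy to spot-check from Haskell (a runnable sketch, not part of the patch):

    -- Both sides return False whenever either operand is a NaN, and for
    -- ordered operands x < y <-> y > x, so the equality always holds:
    prop_swap :: Double -> Double -> Bool
    prop_swap x y = (x < y) == (y > x) && (x <= y) == (y >= x)

    -- e.g. prop_swap (0/0) 1 == True: both comparisons return False.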
@@ -2815,27 +3001,18 @@ condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
         CondCode _ cond cond_code <- condFltCode cond x y
         tmp1 <- getNewRegNat (archWordFormat is32Bit)
         tmp2 <- getNewRegNat (archWordFormat is32Bit)
-        let
-          -- We have to worry about unordered operands (eg. comparisons
-          -- against NaN).  If the operands are unordered, the comparison
-          -- sets the parity flag, carry flag and zero flag.
-          -- All comparisons are supposed to return false for unordered
-          -- operands except for !=, which returns true.
-          --
-          -- Optimisation: we don't have to test the parity flag if we
-          -- know the test has already excluded the unordered case: eg >
-          -- and >= test for a zero carry flag, which can only occur for
-          -- ordered operands.
-          --
-          -- ToDo: by reversing comparisons we could avoid testing the
-          -- parity flag in more cases.
-
+        let -- See Note [SSE Parity Checks]
            code dst =
              cond_code `appOL`
                (case cond of
                   NE  -> or_unordered dst
                   GU  -> plain_test dst
                   GEU -> plain_test dst
+                  -- Use ASSERT so we don't break releases if these creep in.
+                  LTT -> ASSERT2(False, ppr "Should have been turned into >")
+                         and_ordered dst
+                  LE  -> ASSERT2(False, ppr "Should have been turned into >=")
+                         and_ordered dst
                   _   -> and_ordered dst)
 
            plain_test dst = toOL [
diff --git a/compiler/nativeGen/X86/Cond.hs b/compiler/nativeGen/X86/Cond.hs
index 586dabd8f4..35cbf943e1 100644
--- a/compiler/nativeGen/X86/Cond.hs
+++ b/compiler/nativeGen/X86/Cond.hs
@@ -8,6 +8,8 @@ module X86.Cond (
 
 where
 
+import GhcPrelude
+
 data Cond
         = ALWAYS        -- What's really used? ToDo
         | EQQ
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 71f50e9d2a..c7000c9f4b 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -8,7 +8,7 @@
 -- -----------------------------------------------------------------------------
 
-module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest,
+module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
                   getJumpDestBlockId, canShortcut, shortcutStatics,
                   shortcutJump, i386_insert_ffrees, allocMoreStack,
                   maxSpillSlots, archWordFormat)
@@ -17,6 +17,8 @@ where
 #include "HsVersions.h"
 #include "nativeGen/NCG.h"
 
+import GhcPrelude
+
 import X86.Cond
 import X86.Regs
 import Instruction
@@ -320,7 +322,7 @@ data Instr
         | JXX_GBL     Cond Imm              -- non-local version of JXX
         -- Table jump
         | JMP_TBL     Operand               -- Address to jump to
-                      [Maybe BlockId]       -- Blocks in the jump table
+                      [Maybe JumpDest]      -- Targets of the jump table
                       Section               -- Data section jump table should be put in
                       CLabel                -- Label of jump table
         | CALL        (Either Imm Reg) [Reg]
@@ -343,6 +345,10 @@ data Instr
         | BSF         Format Operand Reg    -- bit scan forward
         | BSR         Format Operand Reg    -- bit scan reverse
 
+        -- bit manipulation instructions
+        | PDEP        Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
+        | PEXT        Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
+
         -- prefetch
         | PREFETCH    PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
                                                      -- variant can be NTA, Lvl0, Lvl1, or Lvl2
@@ -462,6 +468,9 @@ x86_regUsageOfInstr platform instr
     BSF    _ src dst    -> mkRU (use_R src []) [dst]
     BSR    _ src dst    -> mkRU (use_R src []) [dst]
 
+    PDEP   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+    PEXT   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+
     -- note: might be a better way to do this
     PREFETCH _  _ src -> mkRU (use_R src []) []
     LOCK i              -> x86_regUsageOfInstr platform i
@@ -638,6 +647,8 @@ x86_patchRegsOfInstr instr env
     CLTD _              -> instr
 
     POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
+    PDEP   fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
+    PEXT   fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
     BSF    fmt src dst -> BSF fmt (patchOp src) (env dst)
     BSR    fmt src dst -> BSR fmt (patchOp src) (env dst)
 
@@ -693,7 +704,7 @@ x86_jumpDestsOfInstr
 x86_jumpDestsOfInstr insn
   = case insn of
         JXX _ id        -> [id]
-        JMP_TBL _ ids _ _ -> [id | Just id <- ids]
+        JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids]
         _               -> []
 
@@ -704,8 +715,12 @@ x86_patchJumpInstr insn patchF
   = case insn of
         JXX cc id       -> JXX cc (patchF id)
         JMP_TBL op ids section lbl
-          -> JMP_TBL op (map (fmap patchF) ids) section lbl
+          -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl
         _               -> insn
+    where
+        patchJumpDest f (DestBlockId id) = DestBlockId (f id)
+        patchJumpDest _ dest             = dest
+
@@ -843,25 +858,104 @@ x86_mkJumpInstr
 x86_mkJumpInstr id
         = [JXX ALWAYS id]
 
+-- Note [Windows stack layout]
+-- | On most OSes the kernel will place a guard page after the current stack
+-- page. If you allocate more than a page's worth you may jump over this
+-- guard page. Not only is this a security issue, but on certain OSes such
+-- as Windows a new page won't be allocated if you don't hit the guard. This
+-- will cause a segfault or access fault.
+--
+-- This function defines if the current allocation amount requires a probe.
+-- On Windows (for now) we emit a call to _chkstk for this. For other OSes
+-- this is not yet implemented.
+-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
+-- The Windows stack looks like this:
+--
+--        +-------------------+
+--        |        SP         |
+--        +-------------------+
+--        |                   |
+--        |    GUARD PAGE     |
+--        |                   |
+--        +-------------------+
+--        |                   |
+--        |                   |
+--        |     UNMAPPED      |
+--        |                   |
+--        |                   |
+--        +-------------------+
+--
+-- In essence each allocation larger than a page size needs to be chunked and
+-- a probe emitted after each page allocation. You have to hit the guard
+-- page so the kernel can map in the next page, otherwise you'll segfault.
+--
+needs_probe_call :: Platform -> Int -> Bool
+needs_probe_call platform amount
+  = case platformOS platform of
+     OSMinGW32 -> case platformArch platform of
+                    ArchX86    -> amount > (4 * 1024)
+                    ArchX86_64 -> amount > (8 * 1024)
+                    _          -> False
+     _         -> False
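A way to picture what needs_probe_call guards against: each page of a large allocation has to be touched in order, so the allocation is effectively split into page-sized probe steps (an illustrative model only; __chkstk_ms performs the real loop):

    -- Split an allocation into the page-sized steps it must probe;
    -- pageSize stands for the 4K/8K thresholds used above.
    probeChunks :: Int -> Int -> [Int]
    probeChunks pageSize amount =
      replicate (amount `div` pageSize) pageSize
        ++ [r | let r = amount `mod` pageSize, r /= 0]

    -- probeChunks 4096 10000 == [4096,4096,1808]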
 x86_mkStackAllocInstr
         :: Platform
         -> Int
-        -> Instr
+        -> [Instr]
 x86_mkStackAllocInstr platform amount
-  = case platformArch platform of
-      ArchX86    -> SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
-      ArchX86_64 -> SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
-      _ -> panic "x86_mkStackAllocInstr"
+  = case platformOS platform of
+      OSMinGW32 ->
+        -- These will clobber AX but this should be ok because
+        --
+        -- 1. It is the first thing we do when entering the closure and AX is
+        --    a caller-saved register on Windows both on x86_64 and x86.
+        --
+        -- 2. The closures are only entered via a call or longjmp in which case
+        --    there are no expectations for volatile registers.
+        --
+        -- 3. When the target is a local branch point it is re-targeted
+        --    after the dealloc, preserving #2. See note [extra spill slots].
+        --
+        -- We emit a call because the stack probes are quite involved and
+        -- would bloat code size a lot. GHC doesn't really have an -Os.
+        -- __chkstk is guaranteed to leave all nonvolatile registers and AX
+        -- untouched. It's part of the standard prologue code for any Windows
+        -- function dropping the stack more than a page.
+        -- See Note [Windows stack layout]
+        case platformArch platform of
+            ArchX86 | needs_probe_call platform amount ->
+                       [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
+                       , CALL (Left $ strImmLit "___chkstk_ms") [eax]
+                       , SUB II32 (OpReg eax) (OpReg esp)
+                       ]
+                    | otherwise ->
+                       [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
+                       , TEST II32 (OpReg esp) (OpReg esp)
+                       ]
+            ArchX86_64 | needs_probe_call platform amount ->
+                          [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
+                          , CALL (Left $ strImmLit "__chkstk_ms") [rax]
+                          , SUB II64 (OpReg rax) (OpReg rsp)
+                          ]
+                       | otherwise ->
+                          [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
+                          , TEST II64 (OpReg rsp) (OpReg rsp)
+                          ]
+            _ -> panic "x86_mkStackAllocInstr"
+      _       ->
+        case platformArch platform of
+          ArchX86    -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
+          ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
+          _ -> panic "x86_mkStackAllocInstr"
 
 x86_mkStackDeallocInstr
         :: Platform
         -> Int
-        -> Instr
+        -> [Instr]
 x86_mkStackDeallocInstr platform amount
   = case platformArch platform of
-      ArchX86    -> ADD II32 (OpImm (ImmInt amount)) (OpReg esp)
-      ArchX86_64 -> ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)
+      ArchX86    -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)]
+      ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
       _ -> panic "x86_mkStackDeallocInstr"
 
 i386_insert_ffrees
@@ -981,7 +1075,7 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
 
     insert_stack_insns (BasicBlock id insns)
        | Just new_blockid <- mapLookup id new_blockmap
-       = [ BasicBlock id [alloc, JXX ALWAYS new_blockid]
+       = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
          , BasicBlock new_blockid block' ]
        | otherwise
        = [ BasicBlock id block' ]
@@ -989,7 +1083,7 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
        block' = foldr insert_dealloc [] insns
 
     insert_dealloc insn r = case insn of
-       JMP _ _     -> dealloc : insn : r
+       JMP _ _     -> dealloc ++ (insn : r)
        JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
        _other      -> x86_patchJumpInstr insn retarget : r
            where retarget b = fromMaybe b (mapLookup b new_blockmap)
@@ -998,7 +1092,6 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
     -- in
     return (CmmProc info lbl live (ListGraph new_code))
 
-
 data JumpDest = DestBlockId BlockId | DestImm Imm
 
 getJumpDestBlockId :: JumpDest -> Maybe BlockId
@@ -1015,14 +1108,24 @@ canShortcut _                    = Nothing
 
 -- The blockset helps avoid following cycles.
 shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
 shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
-  where shortcutJump' fn seen insn@(JXX cc id) =
-          if setMember id seen then insn
-          else case fn id of
-            Nothing                -> insn
-            Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
-            Just (DestImm imm)     -> shortcutJump' fn seen' (JXX_GBL cc imm)
-          where seen' = setInsert id seen
-        shortcutJump' _ _ other = other
+  where
+    shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
+    shortcutJump' fn seen insn@(JXX cc id) =
+        if setMember id seen then insn
+        else case fn id of
+            Nothing                -> insn
+            Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
+            Just (DestImm imm)     -> shortcutJump' fn seen' (JXX_GBL cc imm)
+        where seen' = setInsert id seen
+    shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
+        let updateBlock (Just (DestBlockId bid)) =
+                case fn bid of
+                    Nothing   -> Just (DestBlockId bid )
+                    Just dest -> Just dest
+            updateBlock dest = dest
+            blocks' = map updateBlock blocks
+        in  JMP_TBL addr blocks' section tblId
+    shortcutJump' _ _ other = other
 
 -- Here because it knows about JumpDest
 shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, CmmStatics) -> (Alignment, CmmStatics)
@@ -1033,14 +1136,14 @@ shortcutStatics fn (align, Statics lbl statics)
 
 shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
 shortcutLabel fn lab
-  | Just uq <- maybeAsmTemp lab = shortBlockId fn emptyUniqSet (mkBlockId uq)
-  | otherwise                   = lab
+  | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId
+  | otherwise                              = lab
 
 shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
 shortcutStatic fn (CmmStaticLit (CmmLabel lab))
   = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
-shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off))
-  = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off)
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+  = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
         -- slightly dodgy, we're ignoring the second label, but this
         -- works with the way we use CmmLabelDiffOff for jump tables now.
 shortcutStatic _ other_static
@@ -1054,8 +1157,8 @@ shortBlockId
 shortBlockId fn seen blockid =
   case (elementOfUniqSet uq seen, fn blockid) of
-    (True, _)    -> mkAsmTempLabel uq
-    (_, Nothing) -> mkAsmTempLabel uq
+    (True, _)    -> blockLbl blockid
+    (_, Nothing) -> blockLbl blockid
     (_, Just (DestBlockId blockid'))  -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
     (_, Just (DestImm (ImmCLbl lbl))) -> lbl
     (_, _other) -> panic "shortBlockId"
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index fce432a3dc..03d4fce794 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -23,6 +23,8 @@ where
 #include "HsVersions.h"
 #include "nativeGen/NCG.h"
 
+import GhcPrelude
+
 import X86.Regs
 import X86.Instr
 import X86.Cond
@@ -37,8 +39,9 @@ import Hoopl.Label
 import BasicTypes       (Alignment)
 import DynFlags
 import Cmm              hiding (topInfoTable)
+import BlockId
 import CLabel
-import Unique           ( pprUniqueAlways, Uniquable(..) )
+import Unique           ( pprUniqueAlways )
 import Platform
 import FastString
 import Outputable
@@ -70,12 +73,17 @@ import Data.Bits
 -- .subsections_via_symbols and -dead_strip can be found at
 -- <https://developer.apple.com/library/mac/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html#//apple_ref/doc/uid/TP30000823-TPXREF101>
 
+pprProcAlignment :: SDoc
+pprProcAlignment = sdocWithDynFlags $ \dflags ->
+  (maybe empty pprAlign . cmmProcAlignment $ dflags)
+
 pprNatCmmDecl :: NatCmmDecl (Alignment, CmmStatics) Instr -> SDoc
 pprNatCmmDecl (CmmData section dats) =
   pprSectionAlign section $$ pprDatas dats
 
 pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
   sdocWithDynFlags $ \dflags ->
+  pprProcAlignment $$
   case topInfoTable proc of
     Nothing ->
        case blocks of
@@ -83,6 +91,7 @@ pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
            pprLabel lbl
          blocks -> -- special case for code without info table:
            pprSectionAlign (Section Text lbl) $$
+           pprProcAlignment $$
            pprLabel lbl $$ -- blocks guaranteed not null, so label needed
            vcat (map (pprBasicBlock top_info) blocks) $$
            (if debugLevel dflags > 0
@@ -92,6 +101,7 @@ pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
     Just (Statics info_lbl _) ->
       sdocWithPlatform $ \platform ->
       pprSectionAlign (Section Text info_lbl) $$
+      pprProcAlignment $$
       (if platformHasSubsectionsViaSymbols platform
           then ppr (mkDeadStripPreventer info_lbl) <> char ':'
           else empty) $$
@@ -126,7 +136,7 @@ pprBasicBlock info_env (BasicBlock blockid instrs)
      (if debugLevel dflags > 0
       then ppr (mkAsmTempEndLabel asmLbl) <> char ':' else empty)
  where
-    asmLbl = mkAsmTempLabel (getUnique blockid)
+    asmLbl = blockLbl blockid
     maybe_infotable = case mapLookup blockid info_env of
        Nothing   -> empty
        Just (Statics info_lbl info) ->
@@ -160,35 +170,116 @@ pprGloblDecl lbl
   | not (externallyVisibleCLabel lbl) = empty
   | otherwise = text ".globl " <> ppr lbl
 
-pprTypeAndSizeDecl :: CLabel -> SDoc
-pprTypeAndSizeDecl lbl
+pprLabelType' :: DynFlags -> CLabel -> SDoc
+pprLabelType' dflags lbl =
+  if isCFunctionLabel lbl || functionOkInfoTable then
+    text "@function"
+  else
+    text "@object"
+  where
+    {-
+    NOTE: This is a bit hacky.
+
+    With `tablesNextToCode`, info tables look like this:
+    ```
+      <info table data>
+    label_info:
+      <info table code>
+    ```
+    So the info table label actually points exactly to the code, and we can
+    mark the label as @function. (This is required to make perf and potentially
+    other tools work on Haskell binaries.)
+    This usually works well, but it can cause issues with a linker.
+    A linker uses different algorithms for relocation depending on
+    the symbol type. For some reason, a linker will generate a JUMP_SLOT
+    relocation when a constructor info table is referenced from a data section.
+    This only happens with a static constructor call, so
+    we mark _con_info symbols as `@object` to avoid the issue with relocations.
+
+    @SimonMarlow hack explanation:
+    "The reasoning goes like this:
+
+    * The danger when we mark a symbol as `@function` is that the linker will
+      redirect it to point to the PLT and use a `JUMP_SLOT` relocation when
+      the symbol refers to something outside the current shared object.
+      A PLT / JUMP_SLOT reference only works for symbols that we jump to, not
+      for symbols representing data, nor for info table symbol references which
+      we expect to point directly to the info table.
+    * GHC generates code that might refer to any info table symbol from the
+      text segment, but that's OK, because those will be explicit GOT
+      references generated by the code generator.
+    * When we refer to info tables from the data segment, it's either
+      * a FUN_STATIC/THUNK_STATIC local to this module
+      * a `con_info` that could be from anywhere
+
+    So, the only info table symbols that we might refer to from the data
+    segment of another shared object are `con_info` symbols, so those are the
+    ones we need to exclude from getting the @function treatment.
+    "
+
+    A good place to check for more:
+    https://ghc.haskell.org/trac/ghc/wiki/Commentary/PositionIndependentCode
+
+    Another possible hack is to create an extra local function symbol for
+    every code-like thing, to give the tools the information they need
+    without messing up the relocations. https://phabricator.haskell.org/D4730
+    -}
+    functionOkInfoTable = tablesNextToCode dflags &&
+                          isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+
+pprTypeDecl :: CLabel -> SDoc
+pprTypeDecl lbl
     = sdocWithPlatform $ \platform ->
       if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
-    then text ".type " <> ppr lbl <> ptext (sLit ", @object")
+    then
+      sdocWithDynFlags $ \df ->
+        text ".type " <> ppr lbl <> ptext (sLit ", ") <> pprLabelType' df lbl
     else empty
 
 pprLabel :: CLabel -> SDoc
 pprLabel lbl = pprGloblDecl lbl
-            $$ pprTypeAndSizeDecl lbl
+            $$ pprTypeDecl lbl
             $$ (ppr lbl <> char ':')
 
+{-
+Note [Pretty print ASCII when AsmCodeGen]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Previously, when generating assembly code, we created an SDoc with
+`(ptext . sLit)` for every byte in a literal bytestring, then
+combined them using `hcat`.
+
+When handling literal bytestrings with millions of bytes,
+millions of SDocs would be created and combined, leading to
+high memory usage.
+
+Now we escape the given bytestring to a string directly and construct the
+SDoc only once. This improvement could dramatically decrease the
+memory allocation from 4.7GB to 1.3GB when embedding a 3MB literal
+string in source code. See Trac #14741 for profiling results.
+-}
 pprASCII :: [Word8] -> SDoc
 pprASCII str
-  = hcat (map (do1 . fromIntegral) str)
+  -- Transform this given literal bytestring to an escaped string and
+  -- construct the literal SDoc directly.
+  -- See Trac #14741
+  -- and Note [Pretty print ASCII when AsmCodeGen]
+  = ptext $ sLit $ foldr (\w s -> (do1 . fromIntegral) w ++ s) "" str
   where
-       do1 :: Int -> SDoc
-       do1 w | '\t' <- chr w = ptext (sLit "\\t")
-       do1 w | '\n' <- chr w = ptext (sLit "\\n")
-       do1 w | '"'  <- chr w = ptext (sLit "\\\"")
-       do1 w | '\\' <- chr w = ptext (sLit "\\\\")
-       do1 w | isPrint (chr w) = char (chr w)
-       do1 w | otherwise = char '\\' <> octal w
-
-       octal :: Int -> SDoc
-       octal w = int ((w `div` 64) `mod` 8)
-              <> int ((w `div` 8) `mod` 8)
-              <> int (w `mod` 8)
+       do1 :: Int -> String
+       do1 w | '\t' <- chr w = "\\t"
+             | '\n' <- chr w = "\\n"
+             | '"'  <- chr w = "\\\""
+             | '\\' <- chr w = "\\\\"
+             | isPrint (chr w) = [chr w]
+             | otherwise = '\\' : octal w
+
+       octal :: Int -> String
+       octal w = [ chr (ord '0' + (w `div` 64) `mod` 8)
+                 , chr (ord '0' + (w `div` 8) `mod` 8)
+                 , chr (ord '0' + w `mod` 8)
+                 ]
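The payoff of the new pprASCII is that escaping happens in one pass over the bytes, building a single String. A standalone sketch of the same escaping (the helper name is illustrative, not from the patch):

    import Data.Char (chr, ord, isPrint)

    escapeByte :: Int -> String
    escapeByte w
      | '\t' <- chr w   = "\\t"
      | '\n' <- chr w   = "\\n"
      | '"'  <- chr w   = "\\\""
      | '\\' <- chr w   = "\\\\"
      | isPrint (chr w) = [chr w]
      | otherwise       = '\\' : [ chr (ord '0' + (w `div` 64) `mod` 8)
                                 , chr (ord '0' + (w `div` 8)  `mod` 8)
                                 , chr (ord '0' +  w `mod` 8) ]

    -- concatMap escapeByte [104,105,0] == "hi\\000": one String for the
    -- whole literal instead of one SDoc per byte.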
 pprAlign :: Int -> SDoc
 pprAlign bytes
@@ -505,7 +596,7 @@ pprDataItem' dflags lit
         -- case lit of
         -- A relative relocation:
-        CmmLabelDiffOff _ _ _ ->
+        CmmLabelDiffOff _ _ _ _ ->
             [text "\t.long\t" <> pprImm imm,
              text "\t.long\t0"]
         _ ->
@@ -516,7 +607,7 @@ pprDataItem' dflags lit
 
 asmComment :: SDoc -> SDoc
-asmComment c = ifPprDebug $ text "# " <> c
+asmComment c = whenPprDebug $ text "# " <> c
 
 pprInstr :: Instr -> SDoc
@@ -645,6 +736,9 @@ pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst)
 pprInstr (BSF format src dst)    = pprOpOp (sLit "bsf") format src (OpReg dst)
 pprInstr (BSR format src dst)    = pprOpOp (sLit "bsr") format src (OpReg dst)
 
+pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst
+pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst
+
 pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src
 pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src
 pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src
@@ -702,7 +796,7 @@ pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op)
 
 pprInstr (JXX cond blockid)
   = pprCondInstr (sLit "j") cond (ppr lab)
-  where lab = mkAsmTempLabel (getUnique blockid)
+  where lab = blockLbl blockid
 
 pprInstr (JXX_GBL cond imm) = pprCondInstr (sLit "j") cond (pprImm imm)
 
@@ -1259,6 +1353,16 @@ pprFormatRegRegReg name format reg1 reg2 reg3
         pprReg format reg3
     ]
 
+pprFormatOpOpReg :: LitString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprFormatOpOpReg name format op1 op2 reg3
+  = hcat [
+        pprMnemonic name format,
+        pprOperand format op1,
+        comma,
+        pprOperand format op2,
+        comma,
+        pprReg format reg3
+    ]
 
 pprFormatAddrReg :: LitString -> Format -> AddrMode -> Reg -> SDoc
 pprFormatAddrReg name format op dst
@@ -1302,4 +1406,3 @@ pprFormatOpOpCoerce name format1 format2 op1 op2
 pprCondInstr :: LitString -> Cond -> SDoc -> SDoc
 pprCondInstr name cond arg
   = hcat [ char '\t', ptext name, pprCond cond, space, arg]
-
diff --git a/compiler/nativeGen/X86/RegInfo.hs b/compiler/nativeGen/X86/RegInfo.hs
index 4dfe0350d4..226441b16f 100644
--- a/compiler/nativeGen/X86/RegInfo.hs
+++ b/compiler/nativeGen/X86/RegInfo.hs
@@ -9,6 +9,8 @@ where
 #include "nativeGen/NCG.h"
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import Format
 import Reg
diff --git a/compiler/nativeGen/X86/Regs.hs b/compiler/nativeGen/X86/Regs.hs
index 4cb82ea224..97c3b984e2 100644
--- a/compiler/nativeGen/X86/Regs.hs
+++ b/compiler/nativeGen/X86/Regs.hs
@@ -48,6 +48,8 @@ where
 #include "nativeGen/NCG.h"
 #include "HsVersions.h"
 
+import GhcPrelude
+
 import CodeGen.Platform
 import Reg
 import RegClass
@@ -58,8 +60,10 @@ import DynFlags
 import Outputable
 import Platform
 
+import qualified Data.Array as A
+
 -- | regSqueeze_class reg
---      Calculuate the maximum number of register colors that could be
+--      Calculate the maximum number of register colors that could be
 --      denied to a node of this class due to having this reg
 --      as a neighbour.
 --
@@ -142,7 +146,7 @@ litToImm (CmmFloat f W32)    = ImmFloat f
 litToImm (CmmFloat f W64)    = ImmDouble f
 litToImm (CmmLabel l)        = ImmCLbl l
 litToImm (CmmLabelOff l off) = ImmIndex l off
-litToImm (CmmLabelDiffOff l1 l2 off)
+litToImm (CmmLabelDiffOff l1 l2 off _)
                              = ImmConstantSum
                                (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
                                (ImmInt off)
@@ -234,7 +238,6 @@ xmmregnos platform = [firstxmm  .. lastxmm platform]
 floatregnos :: Platform -> [RegNo]
 floatregnos platform = fakeregnos ++ xmmregnos platform
 
-
 -- argRegs is the set of regs which are read for an n-argument call to C.
 -- For archs which pass all args on the stack (x86), is empty.
 -- Sparc passes up to the first 6 args in regs.
@@ -267,13 +270,13 @@ showReg platform n
   | n >= firstxmm  = "%xmm" ++ show (n-firstxmm)
   | n >= firstfake = "%fake" ++ show (n-firstfake)
   | n >= 8         = "%r" ++ show n
-  | otherwise      = regNames platform !! n
+  | otherwise      = regNames platform A.! n
 
-regNames :: Platform -> [String]
+regNames :: Platform -> A.Array Int String
 regNames platform
     = if target32Bit platform
-      then ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
-      else ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
+      then A.listArray (0,8) ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
+      else A.listArray (0,8) ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
 
@@ -404,7 +407,10 @@ callClobberedRegs platform
  | target32Bit platform = [eax,ecx,edx] ++ map regSingle (floatregnos platform)
  | platformOS platform == OSMinGW32
         = [rax,rcx,rdx,r8,r9,r10,r11]
-        ++ map regSingle (floatregnos platform)
+        -- Only xmm0-5 are caller-saved registers on 64-bit Windows.
+        -- ( https://docs.microsoft.com/en-us/cpp/build/register-usage )
+        -- For details check the Win64 ABI.
+        ++ map regSingle fakeregnos ++ map xmm [0 .. 5]
  | otherwise
         -- all xmm regs are caller-saves
         -- caller-saves registers