diff options
| author | Johan Tibell <johan.tibell@gmail.com> | 2014-03-13 09:35:21 +0100 |
|---|---|---|
| committer | Johan Tibell <johan.tibell@gmail.com> | 2014-03-22 10:32:02 +0100 |
| commit | 1eece45692fb5d1a5f4ec60c1537f8068237e9c1 (patch) | |
| tree | b5d99d52c5a6ab762f9b92dfd0504105122ed62b /compiler/codeGen | |
| parent | 99ef27913dbe55fa57891bbf97d131e0933733e3 (diff) | |
| download | haskell-1eece45692fb5d1a5f4ec60c1537f8068237e9c1.tar.gz | |
codeGen: inline allocation optimization for clone array primops
The inline allocation version is 69% faster than the out-of-line
version, when cloning an array of 16 unit elements on a 64-bit
machine.
Comparing the new and the old primop implementations isn't
straightforward. The old version had a missing heap check that I
discovered during the development of the new version. Comparing the
old and the new version would requiring fixing the old version, which
in turn means reimplementing the equivalent of MAYBE_CG in StgCmmPrim.
The inline allocation threshold is configurable via
-fmax-inline-alloc-size which gives the maximum array size, in bytes,
to allocate inline. The size does not include the closure header size.
Allowing the same primop to be either inline or out-of-line has some
implication for how we lay out heap checks. We always place a heap
check around out-of-line primops, as they may allocate outside of our
knowledge. However, for the inline primops we only allow allocation
via the standard means (i.e. virtHp). Since the clone primops might be
either inline or out-of-line the heap check layout code now consults
shouldInlinePrimOp to know whether a primop will be inlined.
Diffstat (limited to 'compiler/codeGen')
| -rw-r--r-- | compiler/codeGen/StgCmmExpr.hs | 33 | ||||
| -rw-r--r-- | compiler/codeGen/StgCmmPrim.hs | 132 |
2 files changed, 74 insertions, 91 deletions
diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs index d94eca493e..9b9d6397c4 100644 --- a/compiler/codeGen/StgCmmExpr.hs +++ b/compiler/codeGen/StgCmmExpr.hs @@ -422,8 +422,8 @@ cgCase scrut bndr alt_type alts ; up_hp_usg <- getVirtHp -- Upstream heap usage ; let ret_bndrs = chooseReturnBndrs bndr alt_type alts alt_regs = map (idToReg dflags) ret_bndrs - simple_scrut = isSimpleScrut scrut alt_type - do_gc | not simple_scrut = True + ; simple_scrut <- isSimpleScrut scrut alt_type + ; let do_gc | not simple_scrut = True | isSingleton alts = False | up_hp_usg > 0 = False | otherwise = True @@ -450,6 +450,13 @@ recover any unused heap before passing control to the sequel. If we don't do this, then any unused heap will become slop because the heap check will reset the heap usage. Slop in the heap breaks LDV profiling (+RTS -hb) which needs to do a linear sweep through the nursery. + + +Note [Inlining out-of-line primops and heap checks] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If shouldInlinePrimOp returns True when called from StgCmmExpr for the +purpose of heap check placement, we *must* inline the primop later in +StgCmmPrim. If we don't things will go wrong. -} ----------------- @@ -460,21 +467,25 @@ maybeSaveCostCentre simple_scrut ----------------- -isSimpleScrut :: StgExpr -> AltType -> Bool +isSimpleScrut :: StgExpr -> AltType -> FCode Bool -- Simple scrutinee, does not block or allocate; hence safe to amalgamate -- heap usage from alternatives into the stuff before the case -- NB: if you get this wrong, and claim that the expression doesn't allocate -- when it does, you'll deeply mess up allocation -isSimpleScrut (StgOpApp op _ _) _ = isSimpleOp op -isSimpleScrut (StgLit _) _ = True -- case 1# of { 0# -> ..; ... } -isSimpleScrut (StgApp _ []) (PrimAlt _) = True -- case x# of { 0# -> ..; ... } -isSimpleScrut _ _ = False +isSimpleScrut (StgOpApp op args _) _ = isSimpleOp op args +isSimpleScrut (StgLit _) _ = return True -- case 1# of { 0# -> ..; ... } +isSimpleScrut (StgApp _ []) (PrimAlt _) = return True -- case x# of { 0# -> ..; ... } +isSimpleScrut _ _ = return False -isSimpleOp :: StgOp -> Bool +isSimpleOp :: StgOp -> [StgArg] -> FCode Bool -- True iff the op cannot block or allocate -isSimpleOp (StgFCallOp (CCall (CCallSpec _ _ safe)) _) = not (playSafe safe) -isSimpleOp (StgPrimOp op) = not (primOpOutOfLine op) -isSimpleOp (StgPrimCallOp _) = False +isSimpleOp (StgFCallOp (CCall (CCallSpec _ _ safe)) _) _ = return $! not (playSafe safe) +isSimpleOp (StgPrimOp op) stg_args = do + arg_exprs <- getNonVoidArgAmodes stg_args + dflags <- getDynFlags + -- See Note [Inlining out-of-line primops and heap checks] + return $! isJust $ shouldInlinePrimOp dflags op arg_exprs +isSimpleOp (StgPrimCallOp _) _ = return False ----------------- chooseReturnBndrs :: Id -> AltType -> [StgAlt] -> [NonVoid Id] diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index 28d50c1094..9a748da736 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -8,8 +8,9 @@ module StgCmmPrim ( cgOpApp, - cgPrimOp -- internal(ish), used by cgCase to get code for a - -- comparison without also turning it into a Bool. + cgPrimOp, -- internal(ish), used by cgCase to get code for a + -- comparison without also turning it into a Bool. + shouldInlinePrimOp ) where #include "HsVersions.h" @@ -41,7 +42,6 @@ import Outputable import Util import Control.Monad (liftM, when) -import Data.Bits ------------------------------------------------------------------------ -- Primitive operations and foreign calls @@ -132,12 +132,31 @@ shouldInlinePrimOp :: DynFlags -> PrimOp -- ^ The primop -> [CmmExpr] -- ^ The primop arguments -> Maybe ([LocalReg] -> FCode ()) -shouldInlinePrimOp _ NewByteArrayOp_Char [(CmmLit (CmmInt n _))] - | fromInteger n <= maxInlineAllocThreshold = + +shouldInlinePrimOp dflags NewByteArrayOp_Char [(CmmLit (CmmInt n _))] + | fromInteger n <= maxInlineAllocSize dflags = Just $ \ [res] -> doNewByteArrayOp res (fromInteger n) + shouldInlinePrimOp dflags NewArrayOp [(CmmLit (CmmInt n _)), init] - | wordsToBytes dflags (fromInteger n) <= maxInlineAllocThreshold = + | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags = Just $ \ [res] -> doNewArrayOp res (fromInteger n) init + +shouldInlinePrimOp dflags CloneArrayOp [src, src_off, (CmmLit (CmmInt n _))] + | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags CloneMutableArrayOp [src, src_off, (CmmLit (CmmInt n _))] + | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags = + Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags FreezeArrayOp [src, src_off, (CmmLit (CmmInt n _))] + | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags ThawArrayOp [src, src_off, (CmmLit (CmmInt n _))] + | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags = + Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + shouldInlinePrimOp dflags primop args | primOpOutOfLine primop = Nothing | otherwise = Just $ \ regs -> emitPrimOp dflags regs primop args @@ -328,11 +347,11 @@ emitPrimOp dflags [res] DataToTagOp [arg] -- } emitPrimOp _ [res] UnsafeFreezeArrayOp [arg] = emit $ catAGraphs - [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_infoLabel)), + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN0_infoLabel)), mkAssign (CmmLocal res) arg ] emitPrimOp _ [res] UnsafeFreezeArrayArrayOp [arg] = emit $ catAGraphs - [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_infoLabel)), + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN0_infoLabel)), mkAssign (CmmLocal res) arg ] -- #define unsafeFreezzeByteArrayzh(r,a) r=(a) @@ -345,15 +364,6 @@ emitPrimOp _ [] CopyArrayOp [src,src_off,dst,dst_off,n] = doCopyArrayOp src src_off dst dst_off n emitPrimOp _ [] CopyMutableArrayOp [src,src_off,dst,dst_off,n] = doCopyMutableArrayOp src src_off dst dst_off n -emitPrimOp _ [res] CloneArrayOp [src,src_off,n] = - emitCloneArray mkMAP_FROZEN_infoLabel res src src_off n -emitPrimOp _ [res] CloneMutableArrayOp [src,src_off,n] = - emitCloneArray mkMAP_DIRTY_infoLabel res src src_off n -emitPrimOp _ [res] FreezeArrayOp [src,src_off,n] = - emitCloneArray mkMAP_FROZEN_infoLabel res src src_off n -emitPrimOp _ [res] ThawArrayOp [src,src_off,n] = - emitCloneArray mkMAP_DIRTY_infoLabel res src src_off n - emitPrimOp _ [] CopyArrayArrayOp [src,src_off,dst,dst_off,n] = doCopyArrayOp src src_off dst dst_off n emitPrimOp _ [] CopyMutableArrayArrayOp [src,src_off,dst,dst_off,n] = @@ -1598,10 +1608,6 @@ doNewArrayOp res_r n init = do emit $ mkAssign (CmmLocal res_r) (CmmReg arr) --- | The inline allocation limit is 128 bytes. -maxInlineAllocThreshold :: ByteOff -maxInlineAllocThreshold = 128 - -- ---------------------------------------------------------------------------- -- Copying pointer arrays @@ -1689,45 +1695,40 @@ emitCopyArray copy src0 src_off0 dst0 dst_off0 n0 = do -- allocated array in, a source array, an offset in the source array, -- and the number of elements to copy. Allocates a new array and -- initializes it from the source array. -emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> CmmExpr +emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff -> FCode () -emitCloneArray info_p res_r src0 src_off0 n0 = do +emitCloneArray info_p res_r src src_off n = do dflags <- getDynFlags - let arrPtrsHdrSizeW dflags = mkIntExpr dflags (fixedHdrSize dflags + - (sIZEOF_StgMutArrPtrs_NoHdr dflags `div` wORD_SIZE dflags)) - myCapability = cmmSubWord dflags (CmmReg baseReg) (mkIntExpr dflags (oFFSET_Capability_r dflags)) - -- Passed as arguments (be careful) - src <- assignTempE src0 - src_off <- assignTempE src_off0 - n <- assignTempE n0 - card_bytes <- assignTempE $ cardRoundUpCmm dflags n - size <- assignTempE $ cmmAddWord dflags n (bytesToWordsRoundUpCmm dflags card_bytes) - words <- assignTempE $ cmmAddWord dflags (arrPtrsHdrSizeW dflags) size + let info_ptr = mkLblExpr info_p + rep = arrPtrsRep dflags n - arr_r <- newTemp (bWord dflags) - emitAllocateCall arr_r myCapability words - tickyAllocPrim (mkIntExpr dflags (arrPtrsHdrSize dflags)) (cmmMulWord dflags n (wordSize dflags)) - (zeroExpr dflags) + tickyAllocPrim (mkIntExpr dflags (arrPtrsHdrSize dflags)) + (mkIntExpr dflags (nonHdrSize dflags rep)) + (zeroExpr dflags) - let arr = CmmReg (CmmLocal arr_r) - emitSetDynHdr arr (CmmLit (CmmLabel info_p)) curCCS - emit $ mkStore (cmmOffsetB dflags arr (fixedHdrSize dflags * wORD_SIZE dflags + - oFFSET_StgMutArrPtrs_ptrs dflags)) n - emit $ mkStore (cmmOffsetB dflags arr (fixedHdrSize dflags * wORD_SIZE dflags + - oFFSET_StgMutArrPtrs_size dflags)) size + let hdr_size = wordsToBytes dflags (fixedHdrSize dflags) - dst_p <- assignTempE $ cmmOffsetB dflags arr (arrPtrsHdrSize dflags) - src_p <- assignTempE $ cmmOffsetExprW dflags (cmmOffsetB dflags src (arrPtrsHdrSize dflags)) - src_off + base <- allocHeapClosure rep info_ptr curCCS + [ (mkIntExpr dflags n, + hdr_size + oFFSET_StgMutArrPtrs_ptrs dflags) + , (mkIntExpr dflags (nonHdrSizeW rep), + hdr_size + oFFSET_StgMutArrPtrs_size dflags) + ] - emitMemcpyCall dst_p src_p (cmmMulWord dflags n (wordSize dflags)) (mkIntExpr dflags (wORD_SIZE dflags)) + arr <- CmmLocal `fmap` newTemp (bWord dflags) + emit $ mkAssign arr base - emitMemsetCall (cmmOffsetExprW dflags dst_p n) - (mkIntExpr dflags 1) - card_bytes + dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr) + (arrPtrsHdrSize dflags) + src_p <- assignTempE $ cmmOffsetExprW dflags src + (cmmAddWord dflags + (mkIntExpr dflags (arrPtrsHdrSizeW dflags)) src_off) + + emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n)) (mkIntExpr dflags (wORD_SIZE dflags)) - emit $ mkAssign (CmmLocal res_r) arr + + emit $ mkAssign (CmmLocal res_r) (CmmReg arr) -- | Takes and offset in the destination array, the base address of -- the card table, and the number of elements affected (*not* the @@ -1748,22 +1749,6 @@ cardCmm :: DynFlags -> CmmExpr -> CmmExpr cardCmm dflags i = cmmUShrWord dflags i (mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)) --- Convert a number of elements to a number of cards, rounding up -cardRoundUpCmm :: DynFlags -> CmmExpr -> CmmExpr -cardRoundUpCmm dflags i = - cardCmm dflags (cmmAddWord dflags i - (mkIntExpr dflags - ((1 `shiftL` mUT_ARR_PTRS_CARD_BITS dflags) - 1))) - -bytesToWordsRoundUpCmm :: DynFlags -> CmmExpr -> CmmExpr -bytesToWordsRoundUpCmm dflags e = - cmmQuotWord dflags (cmmAddWord dflags e - (mkIntExpr dflags - (wORD_SIZE dflags - 1))) (wordSize dflags) - -wordSize :: DynFlags -> CmmExpr -wordSize dflags = mkIntExpr dflags (wORD_SIZE dflags) - -- | Emit a call to @memcpy@. emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode () emitMemcpyCall dst src n align = do @@ -1789,19 +1774,6 @@ emitMemsetCall dst c n align = do MO_Memset [ dst, c, n, align ] --- | Emit a call to @allocate@. -emitAllocateCall :: LocalReg -> CmmExpr -> CmmExpr -> FCode () -emitAllocateCall res cap n = do - emitCCall - [ (res, AddrHint) ] - allocate - [ (cap, AddrHint) - , (n, NoHint) - ] - where - allocate = CmmLit (CmmLabel (mkForeignLabel (fsLit "allocate") Nothing - ForeignLabelInExternalPackage IsFunction)) - emitBSwapCall :: LocalReg -> CmmExpr -> Width -> FCode () emitBSwapCall res x width = do emitPrimCall |
