summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohan Tibell <johan.tibell@gmail.com>2014-03-13 09:35:21 +0100
committerJohan Tibell <johan.tibell@gmail.com>2014-03-22 10:32:02 +0100
commit1eece45692fb5d1a5f4ec60c1537f8068237e9c1 (patch)
treeb5d99d52c5a6ab762f9b92dfd0504105122ed62b
parent99ef27913dbe55fa57891bbf97d131e0933733e3 (diff)
downloadhaskell-1eece45692fb5d1a5f4ec60c1537f8068237e9c1.tar.gz
codeGen: inline allocation optimization for clone array primops
The inline allocation version is 69% faster than the out-of-line version, when cloning an array of 16 unit elements on a 64-bit machine. Comparing the new and the old primop implementations isn't straightforward. The old version had a missing heap check that I discovered during the development of the new version. Comparing the old and the new version would requiring fixing the old version, which in turn means reimplementing the equivalent of MAYBE_CG in StgCmmPrim. The inline allocation threshold is configurable via -fmax-inline-alloc-size which gives the maximum array size, in bytes, to allocate inline. The size does not include the closure header size. Allowing the same primop to be either inline or out-of-line has some implication for how we lay out heap checks. We always place a heap check around out-of-line primops, as they may allocate outside of our knowledge. However, for the inline primops we only allow allocation via the standard means (i.e. virtHp). Since the clone primops might be either inline or out-of-line the heap check layout code now consults shouldInlinePrimOp to know whether a primop will be inlined.
-rw-r--r--compiler/cmm/CLabel.hs6
-rw-r--r--compiler/codeGen/StgCmmExpr.hs33
-rw-r--r--compiler/codeGen/StgCmmPrim.hs132
-rw-r--r--compiler/main/DynFlags.hs12
-rw-r--r--compiler/prelude/primops.txt.pp4
-rw-r--r--docs/users_guide/flags.xml13
-rw-r--r--includes/Cmm.h31
-rw-r--r--includes/stg/MiscClosures.h4
-rw-r--r--rts/Linker.c4
-rw-r--r--rts/PrimOps.cmm21
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun064.hs79
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun064.stdout8
-rw-r--r--testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs24
-rw-r--r--testsuite/tests/perf/should_run/all.T7
14 files changed, 279 insertions, 99 deletions
diff --git a/compiler/cmm/CLabel.hs b/compiler/cmm/CLabel.hs
index 022792f1f4..7a9e32d270 100644
--- a/compiler/cmm/CLabel.hs
+++ b/compiler/cmm/CLabel.hs
@@ -54,6 +54,7 @@ module CLabel (
mkIndStaticInfoLabel,
mkMainCapabilityLabel,
mkMAP_FROZEN_infoLabel,
+ mkMAP_FROZEN0_infoLabel,
mkMAP_DIRTY_infoLabel,
mkEMPTY_MVAR_infoLabel,
mkArrWords_infoLabel,
@@ -401,7 +402,7 @@ mkStaticConEntryLabel name c = IdLabel name c StaticConEntry
-- Constructing Cmm Labels
mkDirty_MUT_VAR_Label, mkSplitMarkerLabel, mkUpdInfoLabel,
mkBHUpdInfoLabel, mkIndStaticInfoLabel, mkMainCapabilityLabel,
- mkMAP_FROZEN_infoLabel, mkMAP_DIRTY_infoLabel,
+ mkMAP_FROZEN_infoLabel, mkMAP_FROZEN0_infoLabel, mkMAP_DIRTY_infoLabel,
mkEMPTY_MVAR_infoLabel, mkTopTickyCtrLabel,
mkCAFBlackHoleInfoTableLabel, mkCAFBlackHoleEntryLabel,
mkArrWords_infoLabel :: CLabel
@@ -411,7 +412,8 @@ mkUpdInfoLabel = CmmLabel rtsPackageId (fsLit "stg_upd_frame")
mkBHUpdInfoLabel = CmmLabel rtsPackageId (fsLit "stg_bh_upd_frame" ) CmmInfo
mkIndStaticInfoLabel = CmmLabel rtsPackageId (fsLit "stg_IND_STATIC") CmmInfo
mkMainCapabilityLabel = CmmLabel rtsPackageId (fsLit "MainCapability") CmmData
-mkMAP_FROZEN_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_ARR_PTRS_FROZEN0") CmmInfo
+mkMAP_FROZEN_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_ARR_PTRS_FROZEN") CmmInfo
+mkMAP_FROZEN0_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_ARR_PTRS_FROZEN0") CmmInfo
mkMAP_DIRTY_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_ARR_PTRS_DIRTY") CmmInfo
mkEMPTY_MVAR_infoLabel = CmmLabel rtsPackageId (fsLit "stg_EMPTY_MVAR") CmmInfo
mkTopTickyCtrLabel = CmmLabel rtsPackageId (fsLit "top_ct") CmmData
diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs
index d94eca493e..9b9d6397c4 100644
--- a/compiler/codeGen/StgCmmExpr.hs
+++ b/compiler/codeGen/StgCmmExpr.hs
@@ -422,8 +422,8 @@ cgCase scrut bndr alt_type alts
; up_hp_usg <- getVirtHp -- Upstream heap usage
; let ret_bndrs = chooseReturnBndrs bndr alt_type alts
alt_regs = map (idToReg dflags) ret_bndrs
- simple_scrut = isSimpleScrut scrut alt_type
- do_gc | not simple_scrut = True
+ ; simple_scrut <- isSimpleScrut scrut alt_type
+ ; let do_gc | not simple_scrut = True
| isSingleton alts = False
| up_hp_usg > 0 = False
| otherwise = True
@@ -450,6 +450,13 @@ recover any unused heap before passing control to the sequel. If we
don't do this, then any unused heap will become slop because the heap
check will reset the heap usage. Slop in the heap breaks LDV profiling
(+RTS -hb) which needs to do a linear sweep through the nursery.
+
+
+Note [Inlining out-of-line primops and heap checks]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If shouldInlinePrimOp returns True when called from StgCmmExpr for the
+purpose of heap check placement, we *must* inline the primop later in
+StgCmmPrim. If we don't things will go wrong.
-}
-----------------
@@ -460,21 +467,25 @@ maybeSaveCostCentre simple_scrut
-----------------
-isSimpleScrut :: StgExpr -> AltType -> Bool
+isSimpleScrut :: StgExpr -> AltType -> FCode Bool
-- Simple scrutinee, does not block or allocate; hence safe to amalgamate
-- heap usage from alternatives into the stuff before the case
-- NB: if you get this wrong, and claim that the expression doesn't allocate
-- when it does, you'll deeply mess up allocation
-isSimpleScrut (StgOpApp op _ _) _ = isSimpleOp op
-isSimpleScrut (StgLit _) _ = True -- case 1# of { 0# -> ..; ... }
-isSimpleScrut (StgApp _ []) (PrimAlt _) = True -- case x# of { 0# -> ..; ... }
-isSimpleScrut _ _ = False
+isSimpleScrut (StgOpApp op args _) _ = isSimpleOp op args
+isSimpleScrut (StgLit _) _ = return True -- case 1# of { 0# -> ..; ... }
+isSimpleScrut (StgApp _ []) (PrimAlt _) = return True -- case x# of { 0# -> ..; ... }
+isSimpleScrut _ _ = return False
-isSimpleOp :: StgOp -> Bool
+isSimpleOp :: StgOp -> [StgArg] -> FCode Bool
-- True iff the op cannot block or allocate
-isSimpleOp (StgFCallOp (CCall (CCallSpec _ _ safe)) _) = not (playSafe safe)
-isSimpleOp (StgPrimOp op) = not (primOpOutOfLine op)
-isSimpleOp (StgPrimCallOp _) = False
+isSimpleOp (StgFCallOp (CCall (CCallSpec _ _ safe)) _) _ = return $! not (playSafe safe)
+isSimpleOp (StgPrimOp op) stg_args = do
+ arg_exprs <- getNonVoidArgAmodes stg_args
+ dflags <- getDynFlags
+ -- See Note [Inlining out-of-line primops and heap checks]
+ return $! isJust $ shouldInlinePrimOp dflags op arg_exprs
+isSimpleOp (StgPrimCallOp _) _ = return False
-----------------
chooseReturnBndrs :: Id -> AltType -> [StgAlt] -> [NonVoid Id]
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 28d50c1094..9a748da736 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -8,8 +8,9 @@
module StgCmmPrim (
cgOpApp,
- cgPrimOp -- internal(ish), used by cgCase to get code for a
- -- comparison without also turning it into a Bool.
+ cgPrimOp, -- internal(ish), used by cgCase to get code for a
+ -- comparison without also turning it into a Bool.
+ shouldInlinePrimOp
) where
#include "HsVersions.h"
@@ -41,7 +42,6 @@ import Outputable
import Util
import Control.Monad (liftM, when)
-import Data.Bits
------------------------------------------------------------------------
-- Primitive operations and foreign calls
@@ -132,12 +132,31 @@ shouldInlinePrimOp :: DynFlags
-> PrimOp -- ^ The primop
-> [CmmExpr] -- ^ The primop arguments
-> Maybe ([LocalReg] -> FCode ())
-shouldInlinePrimOp _ NewByteArrayOp_Char [(CmmLit (CmmInt n _))]
- | fromInteger n <= maxInlineAllocThreshold =
+
+shouldInlinePrimOp dflags NewByteArrayOp_Char [(CmmLit (CmmInt n _))]
+ | fromInteger n <= maxInlineAllocSize dflags =
Just $ \ [res] -> doNewByteArrayOp res (fromInteger n)
+
shouldInlinePrimOp dflags NewArrayOp [(CmmLit (CmmInt n _)), init]
- | wordsToBytes dflags (fromInteger n) <= maxInlineAllocThreshold =
+ | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags =
Just $ \ [res] -> doNewArrayOp res (fromInteger n) init
+
+shouldInlinePrimOp dflags CloneArrayOp [src, src_off, (CmmLit (CmmInt n _))]
+ | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags =
+ Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_infoLabel res src src_off (fromInteger n)
+
+shouldInlinePrimOp dflags CloneMutableArrayOp [src, src_off, (CmmLit (CmmInt n _))]
+ | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags =
+ Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n)
+
+shouldInlinePrimOp dflags FreezeArrayOp [src, src_off, (CmmLit (CmmInt n _))]
+ | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags =
+ Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_infoLabel res src src_off (fromInteger n)
+
+shouldInlinePrimOp dflags ThawArrayOp [src, src_off, (CmmLit (CmmInt n _))]
+ | wordsToBytes dflags (fromInteger n) <= maxInlineAllocSize dflags =
+ Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n)
+
shouldInlinePrimOp dflags primop args
| primOpOutOfLine primop = Nothing
| otherwise = Just $ \ regs -> emitPrimOp dflags regs primop args
@@ -328,11 +347,11 @@ emitPrimOp dflags [res] DataToTagOp [arg]
-- }
emitPrimOp _ [res] UnsafeFreezeArrayOp [arg]
= emit $ catAGraphs
- [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_infoLabel)),
+ [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN0_infoLabel)),
mkAssign (CmmLocal res) arg ]
emitPrimOp _ [res] UnsafeFreezeArrayArrayOp [arg]
= emit $ catAGraphs
- [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_infoLabel)),
+ [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN0_infoLabel)),
mkAssign (CmmLocal res) arg ]
-- #define unsafeFreezzeByteArrayzh(r,a) r=(a)
@@ -345,15 +364,6 @@ emitPrimOp _ [] CopyArrayOp [src,src_off,dst,dst_off,n] =
doCopyArrayOp src src_off dst dst_off n
emitPrimOp _ [] CopyMutableArrayOp [src,src_off,dst,dst_off,n] =
doCopyMutableArrayOp src src_off dst dst_off n
-emitPrimOp _ [res] CloneArrayOp [src,src_off,n] =
- emitCloneArray mkMAP_FROZEN_infoLabel res src src_off n
-emitPrimOp _ [res] CloneMutableArrayOp [src,src_off,n] =
- emitCloneArray mkMAP_DIRTY_infoLabel res src src_off n
-emitPrimOp _ [res] FreezeArrayOp [src,src_off,n] =
- emitCloneArray mkMAP_FROZEN_infoLabel res src src_off n
-emitPrimOp _ [res] ThawArrayOp [src,src_off,n] =
- emitCloneArray mkMAP_DIRTY_infoLabel res src src_off n
-
emitPrimOp _ [] CopyArrayArrayOp [src,src_off,dst,dst_off,n] =
doCopyArrayOp src src_off dst dst_off n
emitPrimOp _ [] CopyMutableArrayArrayOp [src,src_off,dst,dst_off,n] =
@@ -1598,10 +1608,6 @@ doNewArrayOp res_r n init = do
emit $ mkAssign (CmmLocal res_r) (CmmReg arr)
--- | The inline allocation limit is 128 bytes.
-maxInlineAllocThreshold :: ByteOff
-maxInlineAllocThreshold = 128
-
-- ----------------------------------------------------------------------------
-- Copying pointer arrays
@@ -1689,45 +1695,40 @@ emitCopyArray copy src0 src_off0 dst0 dst_off0 n0 = do
-- allocated array in, a source array, an offset in the source array,
-- and the number of elements to copy. Allocates a new array and
-- initializes it from the source array.
-emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> CmmExpr
+emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff
-> FCode ()
-emitCloneArray info_p res_r src0 src_off0 n0 = do
+emitCloneArray info_p res_r src src_off n = do
dflags <- getDynFlags
- let arrPtrsHdrSizeW dflags = mkIntExpr dflags (fixedHdrSize dflags +
- (sIZEOF_StgMutArrPtrs_NoHdr dflags `div` wORD_SIZE dflags))
- myCapability = cmmSubWord dflags (CmmReg baseReg) (mkIntExpr dflags (oFFSET_Capability_r dflags))
- -- Passed as arguments (be careful)
- src <- assignTempE src0
- src_off <- assignTempE src_off0
- n <- assignTempE n0
- card_bytes <- assignTempE $ cardRoundUpCmm dflags n
- size <- assignTempE $ cmmAddWord dflags n (bytesToWordsRoundUpCmm dflags card_bytes)
- words <- assignTempE $ cmmAddWord dflags (arrPtrsHdrSizeW dflags) size
+ let info_ptr = mkLblExpr info_p
+ rep = arrPtrsRep dflags n
- arr_r <- newTemp (bWord dflags)
- emitAllocateCall arr_r myCapability words
- tickyAllocPrim (mkIntExpr dflags (arrPtrsHdrSize dflags)) (cmmMulWord dflags n (wordSize dflags))
- (zeroExpr dflags)
+ tickyAllocPrim (mkIntExpr dflags (arrPtrsHdrSize dflags))
+ (mkIntExpr dflags (nonHdrSize dflags rep))
+ (zeroExpr dflags)
- let arr = CmmReg (CmmLocal arr_r)
- emitSetDynHdr arr (CmmLit (CmmLabel info_p)) curCCS
- emit $ mkStore (cmmOffsetB dflags arr (fixedHdrSize dflags * wORD_SIZE dflags +
- oFFSET_StgMutArrPtrs_ptrs dflags)) n
- emit $ mkStore (cmmOffsetB dflags arr (fixedHdrSize dflags * wORD_SIZE dflags +
- oFFSET_StgMutArrPtrs_size dflags)) size
+ let hdr_size = wordsToBytes dflags (fixedHdrSize dflags)
- dst_p <- assignTempE $ cmmOffsetB dflags arr (arrPtrsHdrSize dflags)
- src_p <- assignTempE $ cmmOffsetExprW dflags (cmmOffsetB dflags src (arrPtrsHdrSize dflags))
- src_off
+ base <- allocHeapClosure rep info_ptr curCCS
+ [ (mkIntExpr dflags n,
+ hdr_size + oFFSET_StgMutArrPtrs_ptrs dflags)
+ , (mkIntExpr dflags (nonHdrSizeW rep),
+ hdr_size + oFFSET_StgMutArrPtrs_size dflags)
+ ]
- emitMemcpyCall dst_p src_p (cmmMulWord dflags n (wordSize dflags)) (mkIntExpr dflags (wORD_SIZE dflags))
+ arr <- CmmLocal `fmap` newTemp (bWord dflags)
+ emit $ mkAssign arr base
- emitMemsetCall (cmmOffsetExprW dflags dst_p n)
- (mkIntExpr dflags 1)
- card_bytes
+ dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr)
+ (arrPtrsHdrSize dflags)
+ src_p <- assignTempE $ cmmOffsetExprW dflags src
+ (cmmAddWord dflags
+ (mkIntExpr dflags (arrPtrsHdrSizeW dflags)) src_off)
+
+ emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
(mkIntExpr dflags (wORD_SIZE dflags))
- emit $ mkAssign (CmmLocal res_r) arr
+
+ emit $ mkAssign (CmmLocal res_r) (CmmReg arr)
-- | Takes and offset in the destination array, the base address of
-- the card table, and the number of elements affected (*not* the
@@ -1748,22 +1749,6 @@ cardCmm :: DynFlags -> CmmExpr -> CmmExpr
cardCmm dflags i =
cmmUShrWord dflags i (mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags))
--- Convert a number of elements to a number of cards, rounding up
-cardRoundUpCmm :: DynFlags -> CmmExpr -> CmmExpr
-cardRoundUpCmm dflags i =
- cardCmm dflags (cmmAddWord dflags i
- (mkIntExpr dflags
- ((1 `shiftL` mUT_ARR_PTRS_CARD_BITS dflags) - 1)))
-
-bytesToWordsRoundUpCmm :: DynFlags -> CmmExpr -> CmmExpr
-bytesToWordsRoundUpCmm dflags e =
- cmmQuotWord dflags (cmmAddWord dflags e
- (mkIntExpr dflags
- (wORD_SIZE dflags - 1))) (wordSize dflags)
-
-wordSize :: DynFlags -> CmmExpr
-wordSize dflags = mkIntExpr dflags (wORD_SIZE dflags)
-
-- | Emit a call to @memcpy@.
emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
emitMemcpyCall dst src n align = do
@@ -1789,19 +1774,6 @@ emitMemsetCall dst c n align = do
MO_Memset
[ dst, c, n, align ]
--- | Emit a call to @allocate@.
-emitAllocateCall :: LocalReg -> CmmExpr -> CmmExpr -> FCode ()
-emitAllocateCall res cap n = do
- emitCCall
- [ (res, AddrHint) ]
- allocate
- [ (cap, AddrHint)
- , (n, NoHint)
- ]
- where
- allocate = CmmLit (CmmLabel (mkForeignLabel (fsLit "allocate") Nothing
- ForeignLabelInExternalPackage IsFunction))
-
emitBSwapCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitBSwapCall res x width = do
emitPrimCall
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index dcdc4b556d..dded24fba3 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -799,7 +799,12 @@ data DynFlags = DynFlags {
rtldInfo :: IORef (Maybe LinkerInfo),
-- | Run-time compiler information
- rtccInfo :: IORef (Maybe CompilerInfo)
+ rtccInfo :: IORef (Maybe CompilerInfo),
+
+ -- Constants used to control the amount of optimization done.
+
+ -- ^ Max size, in bytes, of inline array allocations.
+ maxInlineAllocSize :: Int
}
class HasDynFlags m where
@@ -1448,7 +1453,9 @@ defaultDynFlags mySettings =
avx512f = False,
avx512pf = False,
rtldInfo = panic "defaultDynFlags: no rtldInfo",
- rtccInfo = panic "defaultDynFlags: no rtccInfo"
+ rtccInfo = panic "defaultDynFlags: no rtccInfo",
+
+ maxInlineAllocSize = 128
}
defaultWays :: Settings -> [Way]
@@ -2428,6 +2435,7 @@ dynamic_flags = [
, Flag "fmax-worker-args" (intSuffix (\n d -> d {maxWorkerArgs = n}))
, Flag "fghci-hist-size" (intSuffix (\n d -> d {ghciHistSize = n}))
+ , Flag "fmax-inline-alloc-size" (intSuffix (\n d -> d{ maxInlineAllocSize = n }))
------ Profiling ----------------------------------------------------
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 49fef3523a..e1a9824a16 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -794,6 +794,7 @@ primop CloneArrayOp "cloneArray#" GenPrimOp
source array. The provided array must fully contain the specified
range, but this is not checked.}
with
+ out_of_line = True
has_side_effects = True
code_size = { primOpCodeSizeForeignCall + 4 }
@@ -804,6 +805,7 @@ primop CloneMutableArrayOp "cloneMutableArray#" GenPrimOp
source array. The provided array must fully contain the specified
range, but this is not checked.}
with
+ out_of_line = True
has_side_effects = True
code_size = { primOpCodeSizeForeignCall + 4 }
@@ -814,6 +816,7 @@ primop FreezeArrayOp "freezeArray#" GenPrimOp
source array. The provided array must fully contain the specified
range, but this is not checked.}
with
+ out_of_line = True
has_side_effects = True
code_size = { primOpCodeSizeForeignCall + 4 }
@@ -824,6 +827,7 @@ primop ThawArrayOp "thawArray#" GenPrimOp
source array. The provided array must fully contain the specified
range, but this is not checked.}
with
+ out_of_line = True
has_side_effects = True
code_size = { primOpCodeSizeForeignCall + 4 }
diff --git a/docs/users_guide/flags.xml b/docs/users_guide/flags.xml
index d932813302..b4febf587b 100644
--- a/docs/users_guide/flags.xml
+++ b/docs/users_guide/flags.xml
@@ -1887,6 +1887,19 @@
<entry><option>-fno-unfolding-use-threshold</option></entry>
</row>
+ <row>
+ <entry><option>-fmax-inline-alloc-size</option>=<replaceable>n</replaceable></entry>
+ <entry>Set the maximum size of inline array allocations to
+ <replaceable>n</replaceable> bytes (default: 128). GHC
+ will allocate non-pinned arrays of statically known size
+ in the current nursery block if they're no bigger than
+ <replaceable>n</replaceable> bytes, ignoring GC overheap.
+ This value should be quite a bit smaller than the block
+ size (typically: 4096).</entry>
+ <entry>dynamic</entry>
+ <entry>-</entry>
+ </row>
+
</tbody>
</tgroup>
</informaltable>
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 0e30c1657d..aa868cf431 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -806,4 +806,35 @@
__gen = TO_W_(bdescr_gen_no(__bd)); \
if (__gen > 0) { recordMutableCap(__p, __gen); }
+/* Complete function body for the clone family of (mutable) array ops.
+ Defined as a macro to avoid function call overhead or code
+ duplication. */
+#define cloneArray(info, src, offset, n) \
+ W_ words, size; \
+ gcptr dst, dst_p, src_p; \
+ \
+ again: MAYBE_GC(again); \
+ \
+ size = n + mutArrPtrsCardWords(n); \
+ words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size; \
+ ("ptr" dst) = ccall allocate(MyCapability() "ptr", words); \
+ TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(size), 0); \
+ \
+ SET_HDR(dst, info, CCCS); \
+ StgMutArrPtrs_ptrs(dst) = n; \
+ StgMutArrPtrs_size(dst) = size; \
+ \
+ dst_p = dst + SIZEOF_StgMutArrPtrs; \
+ src_p = src + SIZEOF_StgMutArrPtrs + WDS(offset); \
+ while: \
+ if (n != 0) { \
+ n = n - 1; \
+ W_[dst_p] = W_[src_p]; \
+ dst_p = dst_p + WDS(1); \
+ src_p = src_p + WDS(1); \
+ goto while; \
+ } \
+ \
+ return (dst);
+
#endif /* CMM_H */
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index ff781dd4ec..8be51fb036 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -347,6 +347,10 @@ RTS_FUN_DECL(stg_casIntArrayzh);
RTS_FUN_DECL(stg_fetchAddIntArrayzh);
RTS_FUN_DECL(stg_newArrayzh);
RTS_FUN_DECL(stg_newArrayArrayzh);
+RTS_FUN_DECL(stg_cloneArrayzh);
+RTS_FUN_DECL(stg_cloneMutableArrayzh);
+RTS_FUN_DECL(stg_freezzeArrayzh);
+RTS_FUN_DECL(stg_thawArrayzh);
RTS_FUN_DECL(stg_newMutVarzh);
RTS_FUN_DECL(stg_atomicModifyMutVarzh);
diff --git a/rts/Linker.c b/rts/Linker.c
index 814f930fe4..fee6124965 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -1162,6 +1162,10 @@ typedef struct _RtsSymbolVal {
SymI_HasProto(stg_myThreadIdzh) \
SymI_HasProto(stg_labelThreadzh) \
SymI_HasProto(stg_newArrayzh) \
+ SymI_HasProto(stg_cloneArrayzh) \
+ SymI_HasProto(stg_cloneMutableArrayzh) \
+ SymI_HasProto(stg_freezzeArrayzh) \
+ SymI_HasProto(stg_thawArrayzh) \
SymI_HasProto(stg_newArrayArrayzh) \
SymI_HasProto(stg_casArrayzh) \
SymI_HasProto(stg_newBCOzh) \
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index 25e6534118..5bdccfa9f7 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -225,6 +225,27 @@ stg_unsafeThawArrayzh ( gcptr arr )
}
}
+stg_cloneArrayzh ( gcptr src, W_ offset, W_ n )
+{
+ cloneArray(stg_MUT_ARR_PTRS_FROZEN_info, src, offset, n)
+}
+
+stg_cloneMutableArrayzh ( gcptr src, W_ offset, W_ n )
+{
+ cloneArray(stg_MUT_ARR_PTRS_DIRTY_info, src, offset, n)
+}
+
+// We have to escape the "z" in the name.
+stg_freezzeArrayzh ( gcptr src, W_ offset, W_ n )
+{
+ cloneArray(stg_MUT_ARR_PTRS_FROZEN_info, src, offset, n)
+}
+
+stg_thawArrayzh ( gcptr src, W_ offset, W_ n )
+{
+ cloneArray(stg_MUT_ARR_PTRS_DIRTY_info, src, offset, n)
+}
+
// RRN: Uses the ticketed approach; see casMutVar
stg_casArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new )
/* MutableArray# s a -> Int# -> a -> a -> State# s -> (# State# s, Int#, Any a #) */
diff --git a/testsuite/tests/codeGen/should_run/cgrun064.hs b/testsuite/tests/codeGen/should_run/cgrun064.hs
index 24544c4382..527c6bde67 100644
--- a/testsuite/tests/codeGen/should_run/cgrun064.hs
+++ b/testsuite/tests/codeGen/should_run/cgrun064.hs
@@ -9,15 +9,20 @@ import GHC.Exts hiding (IsList(..))
import GHC.Prim
import GHC.ST
+main :: IO ()
main = putStr
(test_copyArray
++ "\n" ++ test_copyMutableArray
++ "\n" ++ test_copyMutableArrayOverlap
++ "\n" ++ test_cloneArray
+ ++ "\n" ++ test_cloneArrayStatic
++ "\n" ++ test_cloneMutableArray
++ "\n" ++ test_cloneMutableArrayEmpty
+ ++ "\n" ++ test_cloneMutableArrayStatic
++ "\n" ++ test_freezeArray
+ ++ "\n" ++ test_freezeArrayStatic
++ "\n" ++ test_thawArray
+ ++ "\n" ++ test_thawArrayStatic
++ "\n"
)
@@ -32,6 +37,10 @@ len = 130
copied :: Int
copied = len - 2
+copiedStatic :: Int
+copiedStatic = 16
+{-# INLINE copiedStatic #-} -- to make sure optimization triggers
+
------------------------------------------------------------------------
-- copyArray#
@@ -90,9 +99,20 @@ test_cloneArray =
fill src 0 len
src <- unsafeFreezeArray src
-- Don't include the first and last element.
- return $ cloneArray src 1 copied
+ return $! cloneArray src 1 copied
in shows (toList dst copied) "\n"
+-- Check that the static-size optimization works.
+test_cloneArrayStatic :: String
+test_cloneArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ src <- unsafeFreezeArray src
+ -- Don't include the first and last element.
+ return $! cloneArray src 1 copiedStatic
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- cloneMutableArray#
@@ -117,6 +137,17 @@ test_cloneMutableArrayEmpty =
unsafeFreezeArray dst
in shows (toList dst 0) "\n"
+-- Check that the static-size optimization works.
+test_cloneMutableArrayStatic :: String
+test_cloneMutableArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ -- Don't include the first and last element.
+ dst <- cloneMutableArray src 1 copiedStatic
+ unsafeFreezeArray dst
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- freezeArray#
@@ -131,6 +162,16 @@ test_freezeArray =
freezeArray src 1 copied
in shows (toList dst copied) "\n"
+-- Check that the static-size optimization works.
+test_freezeArrayStatic :: String
+test_freezeArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ -- Don't include the first and last element.
+ freezeArray src 1 copiedStatic
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- thawArray#
@@ -147,6 +188,18 @@ test_thawArray =
unsafeFreezeArray dst
in shows (toList dst copied) "\n"
+-- Check that the static-size optimization works.
+test_thawArrayStatic :: String
+test_thawArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ src <- unsafeFreezeArray src
+ -- Don't include the first and last element.
+ dst <- thawArray src 1 copiedStatic
+ unsafeFreezeArray dst
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- Test helpers
@@ -181,13 +234,27 @@ newArray (I# n#) a = ST $ \s# -> case newArray# n# a s# of
(# s2#, marr# #) -> (# s2#, MArray marr# #)
indexArray :: Array a -> Int -> a
-indexArray arr (I# i#) = case indexArray# (unArray arr) i# of
- (# a #) -> a
+indexArray arr i@(I# i#)
+ | i < 0 || i >= len =
+ error $ "bounds error, offset " ++ show i ++ ", length " ++ show len
+ | otherwise = case indexArray# (unArray arr) i# of
+ (# a #) -> a
+ where len = lengthArray arr
writeArray :: MArray s a -> Int -> a -> ST s ()
-writeArray marr (I# i#) a = ST $ \ s# ->
+writeArray marr i@(I# i#) a
+ | i < 0 || i >= len =
+ error $ "bounds error, offset " ++ show i ++ ", length " ++ show len
+ | otherwise = ST $ \ s# ->
case writeArray# (unMArray marr) i# a s# of
s2# -> (# s2#, () #)
+ where len = lengthMArray marr
+
+lengthArray :: Array a -> Int
+lengthArray arr = I# (sizeofArray# (unArray arr))
+
+lengthMArray :: MArray s a -> Int
+lengthMArray marr = I# (sizeofMutableArray# (unMArray marr))
unsafeFreezeArray :: MArray s a -> ST s (Array a)
unsafeFreezeArray marr = ST $ \ s# ->
@@ -206,21 +273,25 @@ copyMutableArray src (I# six#) dst (I# dix#) (I# n#) = ST $ \ s# ->
cloneArray :: Array a -> Int -> Int -> Array a
cloneArray src (I# six#) (I# n#) = Array (cloneArray# (unArray src) six# n#)
+{-# INLINE cloneArray #-} -- to make sure optimization triggers
cloneMutableArray :: MArray s a -> Int -> Int -> ST s (MArray s a)
cloneMutableArray src (I# six#) (I# n#) = ST $ \ s# ->
case cloneMutableArray# (unMArray src) six# n# s# of
(# s2#, marr# #) -> (# s2#, MArray marr# #)
+{-# INLINE cloneMutableArray #-} -- to make sure optimization triggers
freezeArray :: MArray s a -> Int -> Int -> ST s (Array a)
freezeArray src (I# six#) (I# n#) = ST $ \ s# ->
case freezeArray# (unMArray src) six# n# s# of
(# s2#, arr# #) -> (# s2#, Array arr# #)
+{-# INLINE freezeArray #-} -- to make sure optimization triggers
thawArray :: Array a -> Int -> Int -> ST s (MArray s a)
thawArray src (I# six#) (I# n#) = ST $ \ s# ->
case thawArray# (unArray src) six# n# s# of
(# s2#, marr# #) -> (# s2#, MArray marr# #)
+{-# INLINE thawArray #-} -- to make sure optimization triggers
toList :: Array a -> Int -> [a]
toList arr n = go 0
diff --git a/testsuite/tests/codeGen/should_run/cgrun064.stdout b/testsuite/tests/codeGen/should_run/cgrun064.stdout
index 8e741ceec6..86ad8a276c 100644
--- a/testsuite/tests/codeGen/should_run/cgrun064.stdout
+++ b/testsuite/tests/codeGen/should_run/cgrun064.stdout
@@ -6,11 +6,19 @@
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
[]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
diff --git a/testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs b/testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs
new file mode 100644
index 0000000000..54243fe793
--- /dev/null
+++ b/testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs
@@ -0,0 +1,24 @@
+{-# LANGUAGE MagicHash, UnboxedTuples #-}
+module Main where
+
+import GHC.Exts
+import GHC.IO
+
+main :: IO ()
+main = do
+ marr <- newArray
+ loop 10000000 (unMArray marr)
+ where
+ loop :: Int -> MutableArray# RealWorld () -> IO ()
+ loop 0 _ = return ()
+ loop i marr = freezeArray marr >> loop (i-1) marr
+
+data MArray = MArray { unMArray :: !(MutableArray# RealWorld ()) }
+
+newArray :: IO MArray
+newArray = IO $ \s -> case newArray# 16# () s of
+ (# s', marr# #) -> (# s', MArray marr# #)
+
+freezeArray :: MutableArray# RealWorld () -> IO ()
+freezeArray marr# = IO $ \s -> case freezeArray# marr# 0# 16# s of
+ (# s', _ #) -> (# s', () #)
diff --git a/testsuite/tests/perf/should_run/all.T b/testsuite/tests/perf/should_run/all.T
index 14be74ed9d..1e1b6ccba8 100644
--- a/testsuite/tests/perf/should_run/all.T
+++ b/testsuite/tests/perf/should_run/all.T
@@ -344,3 +344,10 @@ test('InlineByteArrayAlloc',
only_ways(['normal'])],
compile_and_run,
['-O2'])
+
+test('InlineCloneArrayAlloc',
+ [stats_num_field('bytes allocated',
+ [ (wordsize(64), 1600041120, 5)]),
+ only_ways(['normal'])],
+ compile_and_run,
+ ['-O2'])