diff options
author | Iavor S. Diatchki <iavor.diatchki@gmail.com> | 2014-07-19 14:29:57 -0700 |
---|---|---|
committer | Iavor S. Diatchki <iavor.diatchki@gmail.com> | 2014-07-19 14:29:57 -0700 |
commit | 524634641c61ab42c555452f6f87119b27f6c331 (patch) | |
tree | f78d17bb6b09fb3b2e22cb4d93c2a3d45accc2d9 /compiler/cmm/CmmSink.hs | |
parent | 79ad1d20c5500e17ce5daaf93b171131669bddad (diff) | |
parent | c41b716d82b1722f909979d02a76e21e9b68886c (diff) | |
download | haskell-wip/ext-solver.tar.gz |
Merge branch 'master' into wip/ext-solverwip/ext-solver
Diffstat (limited to 'compiler/cmm/CmmSink.hs')
-rw-r--r-- | compiler/cmm/CmmSink.hs | 116 |
1 files changed, 104 insertions, 12 deletions
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs index 187f4c47df..4dced9afd2 100644 --- a/compiler/cmm/CmmSink.hs +++ b/compiler/cmm/CmmSink.hs @@ -9,6 +9,7 @@ import BlockId import CmmLive import CmmUtils import Hoopl +import CodeGen.Platform import DynFlags import UniqFM @@ -16,6 +17,7 @@ import PprCmm () import Data.List (partition) import qualified Data.Set as Set +import Data.Maybe -- ----------------------------------------------------------------------------- -- Sinking and inlining @@ -197,7 +199,7 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets') where should_drop = conflicts dflags a final_last - || not (isTrivial rhs) && live_in_multi live_sets r + || not (isTrivial dflags rhs) && live_in_multi live_sets r || r `Set.member` live_in_joins live_sets' | should_drop = live_sets @@ -219,26 +221,24 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks -- small: an expression we don't mind duplicating isSmall :: CmmExpr -> Bool -isSmall (CmmReg (CmmLocal _)) = True -- not globals, we want to coalesce them instead* See below +isSmall (CmmReg (CmmLocal _)) = True -- isSmall (CmmLit _) = True isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y isSmall (CmmRegOff (CmmLocal _) _) = True isSmall _ = False - -Coalesce global registers? What does that mean? We observed no decrease -in performance comming from inlining of global registers, hence we do it now -(see isTrivial function). Ideally we'd like to measure performance using -some tool like perf or VTune and make decisions what to inline based on that. -} -- -- We allow duplication of trivial expressions: registers (both local and -- global) and literals. -- -isTrivial :: CmmExpr -> Bool -isTrivial (CmmReg _) = True -isTrivial (CmmLit _) = True -isTrivial _ = False +isTrivial :: DynFlags -> CmmExpr -> Bool +isTrivial _ (CmmReg (CmmLocal _)) = True +isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?] + isJust (globalRegMaybe (targetPlatform dflags) r) + -- GlobalRegs that are loads from BaseReg are not trivial +isTrivial _ (CmmLit _) = True +isTrivial _ _ = False -- -- annotate each node with the set of registers live *after* the node @@ -401,7 +401,7 @@ tryToInline dflags live node assigs = go usages node [] assigs | cannot_inline = dont_inline | occurs_none = discard -- Note [discard during inlining] | occurs_once = inline_and_discard - | isTrivial rhs = inline_and_keep + | isTrivial dflags rhs = inline_and_keep | otherwise = dont_inline where inline_and_discard = go usages' inl_node skipped rest @@ -650,6 +650,10 @@ data AbsMem -- perhaps we ought to have a special annotation for calls that can -- modify heap/stack memory. For now we just use the conservative -- definition here. +-- +-- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and +-- therefore we should never float any memory operations across one of +-- these calls. bothMems :: AbsMem -> AbsMem -> AbsMem @@ -695,3 +699,91 @@ regAddr _ (CmmGlobal Hp) _ _ = HeapMem regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself regAddr _ _ _ _ = AnyMem + +{- +Note [Inline GlobalRegs?] + +Should we freely inline GlobalRegs? + +Actually it doesn't make a huge amount of difference either way, so we +*do* currently treat GlobalRegs as "trivial" and inline them +everywhere, but for what it's worth, here is what I discovered when I +(SimonM) looked into this: + +Common sense says we should not inline GlobalRegs, because when we +have + + x = R1 + +the register allocator will coalesce this assignment, generating no +code, and simply record the fact that x is bound to $rbx (or +whatever). Furthermore, if we were to sink this assignment, then the +range of code over which R1 is live increases, and the range of code +over which x is live decreases. All things being equal, it is better +for x to be live than R1, because R1 is a fixed register whereas x can +live in any register. So we should neither sink nor inline 'x = R1'. + +However, not inlining GlobalRegs can have surprising +consequences. e.g. (cgrun020) + + c3EN: + _s3DB::P64 = R1; + _c3ES::P64 = _s3DB::P64 & 7; + if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV; + c3EU: + _s3DD::P64 = P64[_s3DB::P64 + 6]; + _s3DE::P64 = P64[_s3DB::P64 + 14]; + I64[Sp - 8] = c3F0; + R1 = _s3DE::P64; + P64[Sp] = _s3DD::P64; + +inlining the GlobalReg gives: + + c3EN: + if (R1 & 7 >= 2) goto c3EU; else goto c3EV; + c3EU: + I64[Sp - 8] = c3F0; + _s3DD::P64 = P64[R1 + 6]; + R1 = P64[R1 + 14]; + P64[Sp] = _s3DD::P64; + +but if we don't inline the GlobalReg, instead we get: + + _s3DB::P64 = R1; + if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV; + c3EU: + I64[Sp - 8] = c3F0; + R1 = P64[_s3DB::P64 + 14]; + P64[Sp] = P64[_s3DB::P64 + 6]; + +This looks better - we managed to inline _s3DD - but in fact it +generates an extra reg-reg move: + +.Lc3EU: + movq $c3F0_info,-8(%rbp) + movq %rbx,%rax + movq 14(%rbx),%rbx + movq 6(%rax),%rax + movq %rax,(%rbp) + +because _s3DB is now live across the R1 assignment, we lost the +benefit of coalescing. + +Who is at fault here? Perhaps if we knew that _s3DB was an alias for +R1, then we would not sink a reference to _s3DB past the R1 +assignment. Or perhaps we *should* do that - we might gain by sinking +it, despite losing the coalescing opportunity. + +Sometimes not inlining global registers wins by virtue of the rule +about not inlining into arguments of a foreign call, e.g. (T7163) this +is what happens when we inlined F1: + + _s3L2::F32 = F1; + _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32); + (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32); + +but if we don't inline F1: + + (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32, + 10.0 :: W32)); +-} |