Merge branch 'master' into wip/ext-solverwip/ext-solver

author: Iavor S. Diatchki <iavor.diatchki@gmail.com> 2014-07-19 14:29:57 -0700
committer: Iavor S. Diatchki <iavor.diatchki@gmail.com> 2014-07-19 14:29:57 -0700
commit: 524634641c61ab42c555452f6f87119b27f6c331 (patch)
tree: f78d17bb6b09fb3b2e22cb4d93c2a3d45accc2d9 /compiler/cmm/CmmSink.hs
parent: 79ad1d20c5500e17ce5daaf93b171131669bddad (diff)
parent: c41b716d82b1722f909979d02a76e21e9b68886c (diff)
download: haskell-wip/ext-solver.tar.gz
1 files changed, 104 insertions, 12 deletions
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs
index 187f4c47df..4dced9afd2 100644
--- a/compiler/cmm/CmmSink.hs
+++ b/compiler/cmm/CmmSink.hs
@@ -9,6 +9,7 @@ import BlockId
 import CmmLive
 import CmmUtils
 import Hoopl
+import CodeGen.Platform
 
 import DynFlags
 import UniqFM
@@ -16,6 +17,7 @@ import PprCmm ()
 
 import Data.List (partition)
 import qualified Data.Set as Set
+import Data.Maybe
 
 -- -----------------------------------------------------------------------------
 -- Sinking and inlining
@@ -197,7 +199,7 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
       drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
           where
             should_drop =  conflicts dflags a final_last
-                        || not (isTrivial rhs) && live_in_multi live_sets r
+                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                         || r `Set.member` live_in_joins
 
             live_sets' | should_drop = live_sets
@@ -219,26 +221,24 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
 
 -- small: an expression we don't mind duplicating
 isSmall :: CmmExpr -> Bool
-isSmall (CmmReg (CmmLocal _)) = True  -- not globals, we want to coalesce them instead* See below
+isSmall (CmmReg (CmmLocal _)) = True  --
 isSmall (CmmLit _) = True
 isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
 isSmall (CmmRegOff (CmmLocal _) _) = True
 isSmall _ = False
-
-Coalesce global registers? What does that mean? We observed no decrease
-in performance comming from inlining of global registers, hence we do it now
-(see isTrivial function). Ideally we'd like to measure performance using
-some tool like perf or VTune and make decisions what to inline based on that.
 -}
 
 --
 -- We allow duplication of trivial expressions: registers (both local and
 -- global) and literals.
 --
-isTrivial :: CmmExpr -> Bool
-isTrivial (CmmReg _) = True
-isTrivial (CmmLit _) = True
-isTrivial _          = False
+isTrivial :: DynFlags -> CmmExpr -> Bool
+isTrivial _ (CmmReg (CmmLocal _)) = True
+isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
+   isJust (globalRegMaybe (targetPlatform dflags) r)
+   -- GlobalRegs that are loads from BaseReg are not trivial
+isTrivial _ (CmmLit _) = True
+isTrivial _ _          = False
 
 --
 -- annotate each node with the set of registers live *after* the node
@@ -401,7 +401,7 @@ tryToInline dflags live node assigs = go usages node [] assigs
    | cannot_inline           = dont_inline
    | occurs_none             = discard  -- Note [discard during inlining]
    | occurs_once             = inline_and_discard
-   | isTrivial rhs           = inline_and_keep
+   | isTrivial dflags rhs    = inline_and_keep
    | otherwise               = dont_inline
    where
         inline_and_discard = go usages' inl_node skipped rest
@@ -650,6 +650,10 @@ data AbsMem
 -- perhaps we ought to have a special annotation for calls that can
 -- modify heap/stack memory.  For now we just use the conservative
 -- definition here.
+--
+-- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and
+-- therefore we should never float any memory operations across one of
+-- these calls.
 
 
 bothMems :: AbsMem -> AbsMem -> AbsMem
@@ -695,3 +699,91 @@ regAddr _      (CmmGlobal Hp) _ _ = HeapMem
 regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
 regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
 regAddr _      _ _ _ = AnyMem
+
+{-
+Note [Inline GlobalRegs?]
+
+Should we freely inline GlobalRegs?
+
+Actually it doesn't make a huge amount of difference either way, so we
+*do* currently treat GlobalRegs as "trivial" and inline them
+everywhere, but for what it's worth, here is what I discovered when I
+(SimonM) looked into this:
+
+Common sense says we should not inline GlobalRegs, because when we
+have
+
+  x = R1
+
+the register allocator will coalesce this assignment, generating no
+code, and simply record the fact that x is bound to $rbx (or
+whatever).  Furthermore, if we were to sink this assignment, then the
+range of code over which R1 is live increases, and the range of code
+over which x is live decreases.  All things being equal, it is better
+for x to be live than R1, because R1 is a fixed register whereas x can
+live in any register.  So we should neither sink nor inline 'x = R1'.
+
+However, not inlining GlobalRegs can have surprising
+consequences. e.g. (cgrun020)
+
+  c3EN:
+      _s3DB::P64 = R1;
+      _c3ES::P64 = _s3DB::P64 & 7;
+      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
+  c3EU:
+      _s3DD::P64 = P64[_s3DB::P64 + 6];
+      _s3DE::P64 = P64[_s3DB::P64 + 14];
+      I64[Sp - 8] = c3F0;
+      R1 = _s3DE::P64;
+      P64[Sp] = _s3DD::P64;
+
+inlining the GlobalReg gives:
+
+  c3EN:
+      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
+  c3EU:
+      I64[Sp - 8] = c3F0;
+      _s3DD::P64 = P64[R1 + 6];
+      R1 = P64[R1 + 14];
+      P64[Sp] = _s3DD::P64;
+
+but if we don't inline the GlobalReg, instead we get:
+
+      _s3DB::P64 = R1;
+      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
+  c3EU:
+      I64[Sp - 8] = c3F0;
+      R1 = P64[_s3DB::P64 + 14];
+      P64[Sp] = P64[_s3DB::P64 + 6];
+
+This looks better - we managed to inline _s3DD - but in fact it
+generates an extra reg-reg move:
+
+.Lc3EU:
+        movq $c3F0_info,-8(%rbp)
+        movq %rbx,%rax
+        movq 14(%rbx),%rbx
+        movq 6(%rax),%rax
+        movq %rax,(%rbp)
+
+because _s3DB is now live across the R1 assignment, we lost the
+benefit of coalescing.
+
+Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
+R1, then we would not sink a reference to _s3DB past the R1
+assignment.  Or perhaps we *should* do that - we might gain by sinking
+it, despite losing the coalescing opportunity.
+
+Sometimes not inlining global registers wins by virtue of the rule
+about not inlining into arguments of a foreign call, e.g. (T7163) this
+is what happens when we inlined F1:
+
+      _s3L2::F32 = F1;
+      _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
+      (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(_c3O3::F32);
+
+but if we don't inline F1:
+
+      (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
+                                                                                            10.0 :: W32));
+-}
author	Iavor S. Diatchki <iavor.diatchki@gmail.com>	2014-07-19 14:29:57 -0700
committer	Iavor S. Diatchki <iavor.diatchki@gmail.com>	2014-07-19 14:29:57 -0700
commit	524634641c61ab42c555452f6f87119b27f6c331 (patch)
tree	f78d17bb6b09fb3b2e22cb4d93c2a3d45accc2d9 /compiler/cmm/CmmSink.hs
parent	79ad1d20c5500e17ce5daaf93b171131669bddad (diff)
parent	c41b716d82b1722f909979d02a76e21e9b68886c (diff)
download	haskell-wip/ext-solver.tar.gz