diff options
-rw-r--r-- | compiler/GHC/Core/Opt/OccurAnal.hs | 16 | ||||
-rw-r--r-- | compiler/GHC/Driver/Config/StgToCmm.hs | 1 | ||||
-rw-r--r-- | compiler/GHC/Driver/Flags.hs | 2 | ||||
-rw-r--r-- | compiler/GHC/Driver/Session.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Bind.hs | 9 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Closure.hs | 31 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Config.hs | 1 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Expr.hs | 110 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Heap.hs | 32 | ||||
-rw-r--r-- | compiler/GHC/StgToCmm/Monad.hs | 17 | ||||
-rw-r--r-- | docs/users_guide/using-optimisation.rst | 9 |
11 files changed, 30 insertions, 201 deletions
diff --git a/compiler/GHC/Core/Opt/OccurAnal.hs b/compiler/GHC/Core/Opt/OccurAnal.hs index f296bc77ae..b6e58f0df9 100644 --- a/compiler/GHC/Core/Opt/OccurAnal.hs +++ b/compiler/GHC/Core/Opt/OccurAnal.hs @@ -809,12 +809,10 @@ So instead it would be good to *loopify* and transform the letrec `next` to And now the RHS of the non-recursive `next` is quite cheap to duplicate and since there is only one syntactic occurrence it is quite likely to inline, so we -get a tight loop. Besides, if it *doesn't* inline, the code we get is no worse. +get a tight loop. -(You might think that the loopified form has unconditionally favorable -operational properties even if the let is not inlined, because a jump is cheaper -than a full function call. But we have Note [Self-recursive tail calls] that -makes sure that the pre-loopified code actually has the same performance.) +Even if `next` *doesn't* inline, the code we get is much better: the closure for +`next` is smaller and the recursive call sequence can turn into a jump. We implement this *transformation* in the Occurrence Analyser because it is has all the necessary information at hand in `occAnalRec` while the Simplifier @@ -832,6 +830,14 @@ regression tests T13966, T14287, T22227 (#14068 has no reproducer). SpecConstr doesn't much like the loopified form, though. Hence it implements Note [Denesting non-recursive let bindings]. + +Some historic notes: Loopification was first described in "Low-level code +optimizations in the Glasgow Haskell Compiler" by Krzysztof Woś, though we use +different approach. Krzysztof performed his optimization at the Cmm level, +whereas we perform ours at the Core level (already). Historically, we did +loopification during StgToCmm (d61c3ac) but doing it in Core with join points +enables subsequent optimisations such as inlining `next` in the example above +and makes the old implementation superfluous. -} diff --git a/compiler/GHC/Driver/Config/StgToCmm.hs b/compiler/GHC/Driver/Config/StgToCmm.hs index 38e8f6684d..609c41f213 100644 --- a/compiler/GHC/Driver/Config/StgToCmm.hs +++ b/compiler/GHC/Driver/Config/StgToCmm.hs @@ -32,7 +32,6 @@ initStgToCmmConfig dflags mod = StgToCmmConfig , stgToCmmTickyDynThunk = gopt Opt_Ticky_Dyn_Thunk dflags , stgToCmmTickyTag = gopt Opt_Ticky_Tag dflags -- flags - , stgToCmmLoopification = gopt Opt_Loopification dflags , stgToCmmAlignCheck = gopt Opt_AlignmentSanitisation dflags , stgToCmmOptHpc = gopt Opt_Hpc dflags , stgToCmmFastPAPCalls = gopt Opt_FastPAPCalls dflags diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs index 83d87b6898..2a0aebd224 100644 --- a/compiler/GHC/Driver/Flags.hs +++ b/compiler/GHC/Driver/Flags.hs @@ -274,7 +274,7 @@ data GeneralFlag | Opt_DictsStrict -- be strict in argument dictionaries | Opt_DmdTxDictSel -- ^ deprecated, no effect and behaviour is now default. -- Allowed switching of a special demand transformer for dictionary selectors - | Opt_Loopification -- See Note [Self-recursive tail calls] + | Opt_Loopification -- Deprecated | Opt_CfgBlocklayout -- ^ Use the cfg based block layout algorithm. | Opt_WeightlessBlocklayout -- ^ Layout based on last instruction per block. | Opt_CprAnal diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs index cf6a5da5e3..5cc8bc7a39 100644 --- a/compiler/GHC/Driver/Session.hs +++ b/compiler/GHC/Driver/Session.hs @@ -3436,7 +3436,8 @@ fFlagsDeps = [ flagSpec "liberate-case" Opt_LiberateCase, flagHiddenSpec "llvm-tbaa" Opt_LlvmTBAA, flagHiddenSpec "llvm-fill-undef-with-garbage" Opt_LlvmFillUndefWithGarbage, - flagSpec "loopification" Opt_Loopification, + depFlagSpec "loopification" + Opt_Loopification "loopification now happens as part of Occurrence Analysis", flagSpec "block-layout-cfg" Opt_CfgBlocklayout, flagSpec "block-layout-weightless" Opt_WeightlessBlocklayout, flagSpec "omit-interface-pragmas" Opt_OmitInterfacePragmas, diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs index 177c3f2912..1ddc1675ff 100644 --- a/compiler/GHC/StgToCmm/Bind.hs +++ b/compiler/GHC/StgToCmm/Bind.hs @@ -41,7 +41,6 @@ import GHC.StgToCmm.Closure import GHC.StgToCmm.Foreign (emitPrimCall) import GHC.Cmm.Graph -import GHC.Cmm.BlockId import GHC.Cmm import GHC.Cmm.Info import GHC.Cmm.Utils @@ -511,12 +510,6 @@ closureCodeBody top_lvl bndr cl_info cc args@(arg0:_) body fv_details ; platform <- getPlatform ; let node_points = nodeMustPointToIt profile lf_info node' = if node_points then Just node else Nothing - ; loop_header_id <- newBlockId - -- Extend reader monad with information that - -- self-recursive tail calls can be optimized into local - -- jumps. See Note [Self-recursive tail calls] in GHC.StgToCmm.Expr. - ; withSelfLoop (bndr, loop_header_id, arg_regs) $ do - { -- Main payload ; entryHeapCheck cl_info node' arity arg_regs $ do { -- emit LDV code when profiling @@ -533,7 +526,7 @@ closureCodeBody top_lvl bndr cl_info cc args@(arg0:_) body fv_details ; when node_points $ load_fvs node lf_info fv_bindings ; checkFunctionArgTags (text "TagCheck failed - Argument to local function:" <> ppr bndr) bndr args ; void $ cgExpr body - }}} + }} } diff --git a/compiler/GHC/StgToCmm/Closure.hs b/compiler/GHC/StgToCmm/Closure.hs index fc76664d94..0b2640bb54 100644 --- a/compiler/GHC/StgToCmm/Closure.hs +++ b/compiler/GHC/StgToCmm/Closure.hs @@ -77,7 +77,6 @@ import GHC.Runtime.Heap.Layout import GHC.Cmm import GHC.Cmm.Utils import GHC.StgToCmm.Types -import GHC.StgToCmm.Sequel import GHC.Types.CostCentre import GHC.Cmm.BlockId @@ -95,7 +94,6 @@ import GHC.Types.Basic import GHC.Utils.Outputable import GHC.Utils.Panic import GHC.Utils.Panic.Plain -import GHC.Utils.Misc import Data.Coerce (coerce) import qualified Data.ByteString.Char8 as BS8 @@ -511,22 +509,9 @@ getCallMethod :: StgToCmmConfig -- tail calls using the same data constructor, -- JumpToIt. This saves us one case branch in -- cgIdApp - -> Maybe SelfLoopInfo -- can we perform a self-recursive tail-call -> CallMethod -getCallMethod cfg _ id _ n_args v_args _cg_loc (Just (self_loop_id, block_id, args)) - | stgToCmmLoopification cfg - , id == self_loop_id - , args `lengthIs` (n_args - v_args) - -- If these patterns match then we know that: - -- * loopification optimisation is turned on - -- * function is performing a self-recursive call in a tail position - -- * number of non-void parameters of the function matches functions arity. - -- See Note [Self-recursive tail calls] and Note [Void arguments in - -- self-recursive tail calls] in GHC.StgToCmm.Expr for more details - = JumpToIt block_id args - -getCallMethod cfg name id (LFReEntrant _ arity _ _) n_args _v_args _cg_loc _self_loop_info +getCallMethod cfg name id (LFReEntrant _ arity _ _) n_args _v_args _cg_loc | n_args == 0 -- No args at all && not (profileIsProfiling (stgToCmmProfile cfg)) -- See Note [Evaluating functions with profiling] in rts/Apply.cmm @@ -534,16 +519,16 @@ getCallMethod cfg name id (LFReEntrant _ arity _ _) n_args _v_args _cg_loc _self | n_args < arity = SlowCall -- Not enough args | otherwise = DirectEntry (enterIdLabel (stgToCmmPlatform cfg) name (idCafInfo id)) arity -getCallMethod _ _name _ LFUnlifted n_args _v_args _cg_loc _self_loop_info +getCallMethod _ _name _ LFUnlifted n_args _v_args _cg_loc = assert (n_args == 0) ReturnIt -getCallMethod _ _name _ (LFCon _) n_args _v_args _cg_loc _self_loop_info +getCallMethod _ _name _ (LFCon _) n_args _v_args _cg_loc = assert (n_args == 0) ReturnIt -- n_args=0 because it'd be ill-typed to apply a saturated -- constructor application to anything getCallMethod cfg name id (LFThunk _ _ updatable std_form_info is_fun) - n_args _v_args _cg_loc _self_loop_info + n_args _v_args _cg_loc | Just sig <- idTagSig_maybe id , isTaggedSig sig -- Infered to be already evaluated by Tag Inference @@ -581,7 +566,7 @@ getCallMethod cfg name id (LFThunk _ _ updatable std_form_info is_fun) updatable) 0 -- Imported(Unknown) Ids -getCallMethod cfg name id (LFUnknown might_be_a_function) n_args _v_args _cg_locs _self_loop_info +getCallMethod cfg name id (LFUnknown might_be_a_function) n_args _v_args _cg_locs | n_args == 0 , Just sig <- idTagSig_maybe id , isTaggedSig sig -- Infered to be already evaluated by Tag Inference @@ -598,14 +583,14 @@ getCallMethod cfg name id (LFUnknown might_be_a_function) n_args _v_args _cg_loc EnterIt -- Not a function -- TODO: Redundant with above match? --- getCallMethod _ name _ (LFUnknown False) n_args _v_args _cg_loc _self_loop_info +-- getCallMethod _ name _ (LFUnknown False) n_args _v_args _cg_loc -- = assertPpr (n_args == 0) (ppr name <+> ppr n_args) -- EnterIt -- Not a function -getCallMethod _ _name _ LFLetNoEscape _n_args _v_args (LneLoc blk_id lne_regs) _self_loop_info +getCallMethod _ _name _ LFLetNoEscape _n_args _v_args (LneLoc blk_id lne_regs) = JumpToIt blk_id lne_regs -getCallMethod _ _ _ _ _ _ _ _ = panic "Unknown call method" +getCallMethod _ _ _ _ _ _ _ = panic "Unknown call method" ----------------------------------------------------------------------------- -- Data types for closure information diff --git a/compiler/GHC/StgToCmm/Config.hs b/compiler/GHC/StgToCmm/Config.hs index f2bd349ae7..e5a91b0a7f 100644 --- a/compiler/GHC/StgToCmm/Config.hs +++ b/compiler/GHC/StgToCmm/Config.hs @@ -43,7 +43,6 @@ data StgToCmmConfig = StgToCmmConfig -- dynamic thunks , stgToCmmTickyTag :: !Bool -- ^ True indicates ticky will count number of avoided tag checks by tag inference. ---------------------------------- Flags -------------------------------------- - , stgToCmmLoopification :: !Bool -- ^ Loopification enabled (cf @-floopification@) , stgToCmmAlignCheck :: !Bool -- ^ Insert alignment check (cf @-falignment-sanitisation@) , stgToCmmOptHpc :: !Bool -- ^ perform code generation for code coverage , stgToCmmFastPAPCalls :: !Bool -- ^ diff --git a/compiler/GHC/StgToCmm/Expr.hs b/compiler/GHC/StgToCmm/Expr.hs index fcf91b4509..a7b9677c8e 100644 --- a/compiler/GHC/StgToCmm/Expr.hs +++ b/compiler/GHC/StgToCmm/Expr.hs @@ -996,7 +996,6 @@ cgIdApp fun_id args = do platform <- getPlatform fun_info <- getCgIdInfo fun_id cfg <- getStgToCmmConfig - self_loop <- getSelfLoop let profile = stgToCmmProfile cfg fun_arg = StgVarArg fun_id fun_name = idName fun_id @@ -1004,7 +1003,7 @@ cgIdApp fun_id args = do lf_info = cg_lf fun_info n_args = length args v_args = length $ filter (isZeroBitTy . stgArgType) args - case getCallMethod cfg fun_name fun_id lf_info n_args v_args (cg_loc fun_info) self_loop of + case getCallMethod cfg fun_name fun_id lf_info n_args v_args (cg_loc fun_info) of -- A value in WHNF, so we can just return it. ReturnIt | isZeroBitTy (idType fun_id) -> emitReturn [] @@ -1052,113 +1051,6 @@ cgIdApp fun_id args = do ; emit (mkBranch blk_id) ; return AssignedDirectly } --- Note [Self-recursive tail calls] --- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- --- Self-recursive tail calls can be optimized into a local jump in the same --- way as let-no-escape bindings (see Note [What is a non-escaping let] in --- "GHC.CoreToStg"). Consider this: --- --- foo.info: --- a = R1 // calling convention --- b = R2 --- goto L1 --- L1: ... --- ... --- ... --- L2: R1 = x --- R2 = y --- call foo(R1,R2) --- --- Instead of putting x and y into registers (or other locations required by the --- calling convention) and performing a call we can put them into local --- variables a and b and perform jump to L1: --- --- foo.info: --- a = R1 --- b = R2 --- goto L1 --- L1: ... --- ... --- ... --- L2: a = x --- b = y --- goto L1 --- --- This can be done only when function is calling itself in a tail position --- and only if the call passes number of parameters equal to function's arity. --- Note that this cannot be performed if a function calls itself with a --- continuation. --- --- This in fact implements optimization known as "loopification". It was --- described in "Low-level code optimizations in the Glasgow Haskell Compiler" --- by Krzysztof Woś, though we use different approach. Krzysztof performed his --- optimization at the Cmm level, whereas we perform ours during code generation --- (Stg-to-Cmm pass) essentially making sure that optimized Cmm code is --- generated in the first place. --- --- Implementation is spread across a couple of places in the code: --- --- * FCode monad stores additional information in its reader environment --- (stgToCmmSelfLoop field). This information tells us which function can --- tail call itself in an optimized way (it is the function currently --- being compiled), what is the label of a loop header (L1 in example above) --- and information about local registers in which we should arguments --- before making a call (this would be a and b in example above). --- --- * Whenever we are compiling a function, we set that information to reflect --- the fact that function currently being compiled can be jumped to, instead --- of called. This is done in closureCodyBody in GHC.StgToCmm.Bind. --- --- * We also have to emit a label to which we will be jumping. We make sure --- that the label is placed after a stack check but before the heap --- check. The reason is that making a recursive tail-call does not increase --- the stack so we only need to check once. But it may grow the heap, so we --- have to repeat the heap check in every self-call. This is done in --- do_checks in GHC.StgToCmm.Heap. --- --- * When we begin compilation of another closure we remove the additional --- information from the environment. This is done by forkClosureBody --- in GHC.StgToCmm.Monad. Other functions that duplicate the environment - --- forkLneBody, forkAlts, codeOnly - duplicate that information. In other --- words, we only need to clean the environment of the self-loop information --- when compiling right hand side of a closure (binding). --- --- * When compiling a call (cgIdApp) we use getCallMethod to decide what kind --- of call will be generated. getCallMethod decides to generate a self --- recursive tail call when (a) environment stores information about --- possible self tail-call; (b) that tail call is to a function currently --- being compiled; (c) number of passed non-void arguments is equal to --- function's arity. (d) loopification is turned on via -floopification --- command-line option. --- --- * Command line option to turn loopification on and off is implemented in --- DynFlags, then passed to StgToCmmConfig for this phase. --- --- --- Note [Void arguments in self-recursive tail calls] --- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- --- State# tokens can get in the way of the loopification optimization as seen in --- #11372. Consider this: --- --- foo :: [a] --- -> (a -> State# s -> (# State s, Bool #)) --- -> State# s --- -> (# State# s, Maybe a #) --- foo [] f s = (# s, Nothing #) --- foo (x:xs) f s = case f x s of --- (# s', b #) -> case b of --- True -> (# s', Just x #) --- False -> foo xs f s' --- --- We would like to compile the call to foo as a local jump instead of a call --- (see Note [Self-recursive tail calls]). However, the generated function has --- an arity of 2 while we apply it to 3 arguments, one of them being of void --- type. Thus, we mustn't count arguments of void type when checking whether --- we can turn a call into a self-recursive jump. --- - emitEnter :: CmmExpr -> FCode ReturnKind emitEnter fun = do { platform <- getPlatform diff --git a/compiler/GHC/StgToCmm/Heap.hs b/compiler/GHC/StgToCmm/Heap.hs index a7e7f23e9d..88d48c1897 100644 --- a/compiler/GHC/StgToCmm/Heap.hs +++ b/compiler/GHC/StgToCmm/Heap.hs @@ -635,15 +635,6 @@ do_checks mb_stk_hwm checkYield mb_alloc_lit do_gc = do Just stk_hwm -> tickyStackCheck >> (emit =<< mkCmmIfGoto' (sp_oflo stk_hwm) gc_id (Just False) ) - -- Emit new label that might potentially be a header - -- of a self-recursive tail call. - -- See Note [Self-recursive loop header]. - self_loop_info <- getSelfLoop - case self_loop_info of - Just (_, loop_header_id, _) - | checkYield && isJust mb_stk_hwm -> emitLabel loop_header_id - _otherwise -> return () - if isJust mb_alloc_lit then do tickyHeapCheck @@ -667,26 +658,3 @@ do_checks mb_stk_hwm checkYield mb_alloc_lit do_gc = do -- stack check succeeds. Otherwise we might end up -- with slop at the end of the current block, which can -- confuse the LDV profiler. - --- Note [Self-recursive loop header] --- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- Self-recursive loop header is required by loopification optimization (See --- Note [Self-recursive tail calls] in GHC.StgToCmm.Expr). We emit it if: --- --- 1. There is information about self-loop in the FCode environment. We don't --- check the binder (first component of the self_loop_info) because we are --- certain that if the self-loop info is present then we are compiling the --- binder body. Reason: the only possible way to get here with the --- self_loop_info present is from closureCodeBody. --- --- 2. checkYield && isJust mb_stk_hwm. checkYield tells us that it is possible --- to preempt the heap check (see #367 for motivation behind this check). It --- is True for heap checks placed at the entry to a function and --- let-no-escape heap checks but false for other heap checks (eg. in case --- alternatives or created from hand-written high-level Cmm). The second --- check (isJust mb_stk_hwm) is true for heap checks at the entry to a --- function and some heap checks created in hand-written Cmm. Otherwise it --- is Nothing. In other words the only situation when both conditions are --- true is when compiling stack and heap checks at the entry to a --- function. This is the only situation when we want to emit a self-loop --- label. diff --git a/compiler/GHC/StgToCmm/Monad.hs b/compiler/GHC/StgToCmm/Monad.hs index 9f9d292937..50043bc01f 100644 --- a/compiler/GHC/StgToCmm/Monad.hs +++ b/compiler/GHC/StgToCmm/Monad.hs @@ -54,7 +54,7 @@ module GHC.StgToCmm.Monad ( getModuleName, -- ideally we wouldn't export these, but some other modules access internal state - getState, setState, getSelfLoop, withSelfLoop, getStgToCmmConfig, + getState, setState, getStgToCmmConfig, -- more localised access to monad state CgIdInfo(..), @@ -296,8 +296,6 @@ data FCodeState = -- else the RTS will deadlock _and_ also experience a severe -- performance degradation , fcs_sequel :: !Sequel -- ^ What to do at end of basic block - , fcs_selfloop :: Maybe SelfLoopInfo -- ^ Which tail calls can be compiled as local jumps? - -- See Note [Self-recursive tail calls] in GHC.StgToCmm.Expr , fcs_ticky :: !CLabel -- ^ Destination for ticky counts , fcs_tickscope :: !CmmTickScope -- ^ Tick scope for new blocks & ticks } @@ -455,7 +453,6 @@ initFCodeState :: Platform -> FCodeState initFCodeState p = MkFCodeState { fcs_upframeoffset = platformWordSizeInBytes p , fcs_sequel = Return - , fcs_selfloop = Nothing , fcs_ticky = mkTopTickyCtrLabel , fcs_tickscope = GlobalScope } @@ -467,22 +464,13 @@ getFCodeState = FCode $ \_ fstate state -> (fstate,state) withFCodeState :: FCode a -> FCodeState -> FCode a withFCodeState (FCode fcode) fst = FCode $ \cfg _ state -> fcode cfg fst state -getSelfLoop :: FCode (Maybe SelfLoopInfo) -getSelfLoop = fcs_selfloop <$> getFCodeState - -withSelfLoop :: SelfLoopInfo -> FCode a -> FCode a -withSelfLoop self_loop code = do - fstate <- getFCodeState - withFCodeState code (fstate {fcs_selfloop = Just self_loop}) - -- ---------------------------------------------------------------------------- -- Get/set the end-of-block info withSequel :: Sequel -> FCode a -> FCode a withSequel sequel code = do { fstate <- getFCodeState - ; withFCodeState code (fstate { fcs_sequel = sequel - , fcs_selfloop = Nothing }) } + ; withFCodeState code (fstate { fcs_sequel = sequel }) } getSequel :: FCode Sequel getSequel = fcs_sequel <$> getFCodeState @@ -578,7 +566,6 @@ forkClosureBody body_code ; state <- getState ; let fcs = fstate { fcs_sequel = Return , fcs_upframeoffset = platformWordSizeInBytes platform - , fcs_selfloop = Nothing } fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } ((),fork_state_out) = doFCode body_code cfg fcs fork_state_in diff --git a/docs/users_guide/using-optimisation.rst b/docs/users_guide/using-optimisation.rst index d3ca68a1df..817fd2ca3f 100644 --- a/docs/users_guide/using-optimisation.rst +++ b/docs/users_guide/using-optimisation.rst @@ -649,17 +649,16 @@ by saying ``-fno-wombat``. Set the size threshold for the liberate-case transformation. .. ghc-flag:: -floopification - :shortdesc: Turn saturated self-recursive tail-calls into local jumps in the - generated assembly. Implied by :ghc-flag:`-O`. + :shortdesc: *(deprecated)* Does nothing :type: dynamic :reverse: -fno-loopification :category: :default: on - When this optimisation is enabled the code generator will turn all - self-recursive saturated tail calls into local jumps rather than - function calls. + This flag has no effect since GHC 9.6 - its behavior is always on. + It used to instruct the code generator to turn all self-recursive saturated + tail calls into local jumps rather than function calls. .. ghc-flag:: -fllvm-pass-vectors-in-regs :shortdesc: *(deprecated)* Does nothing |