author     Simon Marlow <marlowsd@gmail.com>        2012-02-03 15:03:06 +0000
committer  Simon Marlow <marlowsd@gmail.com>        2012-02-08 15:50:43 +0000
commit     76999b605423f530ec17562d772eda1c1672db53 (patch)
tree       3ca1208d02ca753ffe07e13fe1bdbc1388d7e9cf /compiler/codeGen
parent     cd38928495f9186646273432151259f3d654b7e2 (diff)
New stack layout algorithm
Also:
- improvements to code generation: push slow-call continuations
on the stack instead of generating explicit continuations
- remove unused CmmInfo wrapper type (replace with CmmInfoTable)
- squash Area and AreaId together, remove now-unused RegSlot
- comment out old unused stack-allocation code that no longer
compiles after removal of RegSlot
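
As a rough illustration of the first bullet: the continuations this patch pushes are laid out by a new mkStkOffsets helper in StgCmmLayout.hs (see the diff below), which places each argument at an increasing byte offset because the stack grows downwards, and skips void arguments entirely. The following is a minimal, self-contained Haskell sketch of that offset computation; the word size and per-rep sizes here are illustrative assumptions, not GHC's definitions.

    -- Standalone model of the mkStkOffsets computation in the patch below.
    -- ArgRep and the sizes are assumed for illustration only.
    data ArgRep = P | N | L | F | D | V   -- pointer, word, long, float, double, void

    argRepSizeW :: ArgRep -> Int          -- size in words (assumed)
    argRepSizeW L = 2
    argRepSizeW D = 2
    argRepSizeW V = 0
    argRepSizeW _ = 1

    wordSize :: Int
    wordSize = 8                          -- assuming a 64-bit target

    -- Walk the things back-to-front, placing each at offset + size,
    -- because the stack grows downwards as the offsets increase;
    -- voids are filtered out, exactly as in the patch's loop.
    stkOffsets :: [(ArgRep, a)] -> (Int, [(a, Int)])
    stkOffsets things = go 0 [] (reverse things)
      where
        go off acc []                = (off, acc)
        go off acc ((V, _) : rest)   = go off acc rest      -- voids take no space
        go off acc ((rep, x) : rest) = go off' ((x, off') : acc) rest
          where off' = off + argRepSizeW rep * wordSize

For example, stkOffsets [(P,"c"),(P,"d")] yields (16,[("c",16),("d",8)]): the topmost allocated byte offset, plus each thing's slot.
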
Diffstat (limited to 'compiler/codeGen')
 compiler/codeGen/CgInfoTbls.hs    |  28
 compiler/codeGen/CgMonad.lhs      |   4
 compiler/codeGen/StgCmmBind.hs    |   2
 compiler/codeGen/StgCmmExpr.hs    |  73
 compiler/codeGen/StgCmmForeign.hs |   2
 compiler/codeGen/StgCmmHeap.hs    |   4
 compiler/codeGen/StgCmmLayout.hs  | 164
 compiler/codeGen/StgCmmMonad.hs   |  18

8 files changed, 222 insertions(+), 73 deletions(-)
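
Much of the StgCmmExpr.hs change below (the new emitEnter) turns on the dynamic pointer-tag test: a closure pointer whose low bits are nonzero is already evaluated and need not be entered. A minimal sketch of that test, assuming the three tag bits of a 64-bit target (the "fun & 7" check in the comment below); the names and types here are illustrative, not GHC's:

    import Data.Bits (complement, (.&.))
    import Data.Word (Word64)

    tagMask :: Word64
    tagMask = 7          -- low 3 bits carry the tag on a 64-bit target

    -- Nonzero tag: the closure is evaluated, so it can be returned or
    -- scrutinised directly; this is the test cmmIsTagged compiles to.
    isTagged :: Word64 -> Bool
    isTagged ptr = ptr .&. tagMask /= 0

    -- Strip the tag to recover the closure's real address before
    -- entering it, as cmmUntag does in emitEnter's Return case.
    untag :: Word64 -> Word64
    untag ptr = ptr .&. complement tagMask
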
diff --git a/compiler/codeGen/CgInfoTbls.hs b/compiler/codeGen/CgInfoTbls.hs
index 25ba154d12..ed5c5261d7 100644
--- a/compiler/codeGen/CgInfoTbls.hs
+++ b/compiler/codeGen/CgInfoTbls.hs
@@ -67,10 +67,9 @@ emitClosureCodeAndInfoTable cl_info args body
 -- Convert from 'ClosureInfo' to 'CmmInfo'.
 -- Not used for return points.  (The 'smRepClosureTypeInt' call would panic.)
-mkCmmInfo :: ClosureInfo -> FCode CmmInfo
+mkCmmInfo :: ClosureInfo -> FCode CmmInfoTable
 mkCmmInfo cl_info
-  = return (CmmInfo gc_target Nothing $
-            CmmInfoTable { cit_lbl  = infoTableLabelFromCI cl_info,
+  = return (CmmInfoTable { cit_lbl  = infoTableLabelFromCI cl_info,
                            cit_rep  = closureSMRep cl_info,
                            cit_prof = prof,
                            cit_srt  = closureSRT cl_info })
@@ -80,14 +79,6 @@ mkCmmInfo cl_info
     ty_descr_w8  = stringToWord8s (closureTypeDescr cl_info)
     val_descr_w8 = stringToWord8s (closureValDescr  cl_info)
 
-    -- The gc_target is to inform the CPS pass when it inserts a stack check.
-    -- Since that pass isn't used yet we'll punt for now.
-    -- When the CPS pass is fully integrated, this should
-    -- be replaced by the label that any heap check jumped to,
-    -- so that branch can be shared by both the heap (from codeGen)
-    -- and stack checks (from the CPS pass).
-    gc_target = panic "TODO: gc_target"
-
 -------------------------------------------------------------------------
 --
 --	Generating the info table and code for a return point
@@ -106,8 +97,7 @@ emitReturnTarget name stmts
        ; blks <- cgStmtsToBlocks stmts
        ; frame <- mkStackLayout
        ; let smrep = mkStackRep (mkLiveness frame)
-             info  = CmmInfo gc_target Nothing info_tbl
-             info_tbl = CmmInfoTable { cit_lbl  = info_lbl
+             info  = CmmInfoTable { cit_lbl  = info_lbl
                                     , cit_prof = NoProfilingInfo
                                     , cit_rep  = smrep
                                     , cit_srt  = srt_info }
@@ -119,14 +109,6 @@ emitReturnTarget name stmts
     info_lbl  = mkReturnInfoLabel uniq
     entry_lbl = mkReturnPtLabel uniq
 
-    -- The gc_target is to inform the CPS pass when it inserts a stack check.
-    -- Since that pass isn't used yet we'll punt for now.
-    -- When the CPS pass is fully integrated, this should
-    -- be replaced by the label that any heap check jumped to,
-    -- so that branch can be shared by both the heap (from codeGen)
-    -- and stack checks (from the CPS pass).
-    gc_target = panic "TODO: gc_target"
-
 -- Build stack layout information from the state of the 'FCode' monad.
 -- Should go away once 'codeGen' starts using the CPS conversion
 -- pass to handle the stack.  Until then, this is really just
@@ -378,8 +360,8 @@ funInfoTable info_ptr
 emitInfoTableAndCode
   :: CLabel             -- Label of entry or ret
-  -> CmmInfo            -- ...the info table
-  -> [CmmFormal]        -- ...args
+  -> CmmInfoTable       -- ...the info table
+  -> [CmmFormal]        -- ...args
   -> [CmmBasicBlock]    -- ...and body
   -> Code
diff --git a/compiler/codeGen/CgMonad.lhs b/compiler/codeGen/CgMonad.lhs
index 59f6accf9d..6e164ce9ee 100644
--- a/compiler/codeGen/CgMonad.lhs
+++ b/compiler/codeGen/CgMonad.lhs
@@ -728,7 +728,7 @@ emitDecl decl
   = do  { state <- getState
         ; setState $ state { cgs_tops = cgs_tops state `snocOL` decl } }
 
-emitProc :: CmmInfo -> CLabel -> [CmmFormal] -> [CmmBasicBlock] -> Code
+emitProc :: CmmInfoTable -> CLabel -> [CmmFormal] -> [CmmBasicBlock] -> Code
 emitProc info lbl [] blocks
   = do  { let proc_block = CmmProc info lbl (ListGraph blocks)
         ; state <- getState
@@ -740,7 +740,7 @@ emitSimpleProc :: CLabel -> Code -> Code
 emitSimpleProc lbl code
   = do  { stmts <- getCgStmts code
         ; blks <- cgStmtsToBlocks stmts
-        ; emitProc (CmmInfo Nothing Nothing CmmNonInfoTable) lbl [] blks }
+        ; emitProc CmmNonInfoTable lbl [] blks }
 
 -- Get all the CmmTops (there should be no stmts)
 -- Return a single Cmm which may be split from other Cmms by
diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs
index 724f28d142..0222299ff2 100644
--- a/compiler/codeGen/StgCmmBind.hs
+++ b/compiler/codeGen/StgCmmBind.hs
@@ -596,7 +596,7 @@ pushUpdateFrame es body
        offset <- foldM push updfr es
        withUpdFrameOff offset body
      where push off e =
-             do emitStore (CmmStackSlot (CallArea Old) base) e
+             do emitStore (CmmStackSlot Old base) e
                 return base
               where base = off + widthInBytes (cmmExprWidth e)
 
diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs
index 3b12b2a4b7..fe41de83fa 100644
--- a/compiler/codeGen/StgCmmExpr.hs
+++ b/compiler/codeGen/StgCmmExpr.hs
@@ -532,16 +532,9 @@ cgTailCall fun_id fun_info args = do
         ReturnIt -> emitReturn [fun]  -- ToDo: does ReturnIt guarantee tagged?
 
         EnterIt -> ASSERT( null args )  -- Discarding arguments
-          do { let entry = entryCode (closureInfoPtr fun)
-             ; [ret,call] <- forkAlts [
-                   getCode $
-                      emitReturn [fun],  -- Is tagged; no need to untag
-                   getCode $ do          -- Not tagged
-                      emitCall (NativeNodeCall, NativeReturn) entry [fun]
-               ]
-             ; emit =<< mkCmmIfThenElse (cmmIsTagged fun) ret call }
-
-        SlowCall -> do      -- A slow function call via the RTS apply routines
+                   emitEnter fun
+
+        SlowCall -> do      -- A slow function call via the RTS apply routines
                 { tickySlowCall lf_info args
                 ; emitComment $ mkFastString "slowCall"
                 ; slowCall fun args }
@@ -565,6 +558,66 @@ cgTailCall fun_id fun_info args = do
     node_points = nodeMustPointToIt lf_info
 
+emitEnter :: CmmExpr -> FCode ()
+emitEnter fun = do
+  { adjustHpBackwards
+  ; sequel <- getSequel
+  ; updfr_off <- getUpdFrameOff
+  ; case sequel of
+      -- For a return, we have the option of generating a tag-test or
+      -- not.  If the value is tagged, we can return directly, which
+      -- is quicker than entering the value.  This is a code
+      -- size/speed trade-off: when optimising for speed rather than
+      -- size we could generate the tag test.
+      --
+      -- Right now, we do what the old codegen did, and omit the tag
+      -- test, just generating an enter.
+      Return _ -> do
+        { let entry = entryCode $ closureInfoPtr $ CmmReg nodeReg
+        ; emit $ mkForeignJump NativeNodeCall entry
+                   [cmmUntag fun] updfr_off
+        }
+
+      -- The result will be scrutinised in the sequel.  This is where
+      -- we generate a tag-test to avoid entering the closure if
+      -- possible.
+      --
+      -- The generated code will be something like this:
+      --
+      --    R1 = fun  -- copyout
+      --    if (fun & 7 != 0) goto Lcall else goto Lret
+      --  Lcall:
+      --    call [fun] returns to Lret
+      --  Lret:
+      --    fun' = R1  -- copyin
+      --    ...
+      --
+      -- Note in particular that the label Lret is used as a
+      -- destination by both the tag-test and the call.  This is
+      -- becase Lret will necessarily be a proc-point, and we want to
+      -- ensure that we generate only one proc-point for this
+      -- sequence.
+      --
+      AssignTo res_regs _ -> do
+       { lret  <- newLabelC
+       ; lcall <- newLabelC
+       ; let area = Young lret
+       ; let (off, copyin) = copyInOflow NativeReturn area res_regs
+             (outArgs, copyout) = copyOutOflow NativeNodeCall Call area
+                                        [fun] updfr_off (0,[])
+       ; let entry = entryCode (closureInfoPtr fun)
+             the_call = toCall entry (Just lret) updfr_off off outArgs
+       ; emit $
+           copyout <*>
+           mkCbranch (cmmIsTagged fun) lret lcall <*>
+           outOfLine lcall the_call <*>
+           mkLabel lret <*>
+           copyin
+       }
+  }
+
+
 {- Note [case on Bool]
    ~~~~~~~~~~~~~~~~~~~
 A case on a Boolean value does two things:
diff --git a/compiler/codeGen/StgCmmForeign.hs b/compiler/codeGen/StgCmmForeign.hs
index f4be622092..3580481043 100644
--- a/compiler/codeGen/StgCmmForeign.hs
+++ b/compiler/codeGen/StgCmmForeign.hs
@@ -184,7 +184,7 @@ emitSaveThreadState :: BlockId -> FCode ()
 emitSaveThreadState bid = do
   -- CurrentTSO->stackobj->sp = Sp;
   emitStore (cmmOffset (CmmLoad (cmmOffset stgCurrentTSO tso_stackobj) bWord) stack_SP)
-            (CmmStackSlot (CallArea (Young bid)) (widthInBytes (typeWidth gcWord)))
+            (CmmStackSlot (Young bid) (widthInBytes (typeWidth gcWord)))
   emit closeNursery
   -- and save the current cost centre stack in the TSO when profiling:
   when opt_SccProfilingOn $
diff --git a/compiler/codeGen/StgCmmHeap.hs b/compiler/codeGen/StgCmmHeap.hs
index 2b0b6f895e..68d078fb28 100644
--- a/compiler/codeGen/StgCmmHeap.hs
+++ b/compiler/codeGen/StgCmmHeap.hs
@@ -418,8 +418,8 @@ altHeapCheck regs code
 
     gc_call sp =
       case rts_label regs of
-        Just gc -> mkCall (CmmLit gc) (GC, GC) regs reg_exprs sp
-        Nothing -> mkCall generic_gc (GC, GC) [] [] sp
+        Just gc -> mkCall (CmmLit gc) (GC, GC) regs reg_exprs sp (0,[])
+        Nothing -> mkCall generic_gc (GC, GC) [] [] sp (0,[])
 
     rts_label [reg]
       | isGcPtrType ty = Just (mkGcLabel "stg_gc_unpt_r1")
diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs
index 0299bc0f96..9ee9192794 100644
--- a/compiler/codeGen/StgCmmLayout.hs
+++ b/compiler/codeGen/StgCmmLayout.hs
@@ -15,7 +15,7 @@ module StgCmmLayout (
         mkArgDescr,
-        emitCall, emitReturn,
+        emitCall, emitReturn, adjustHpBackwards,
 
         emitClosureProcAndInfoTable,
         emitClosureAndInfoTable,
@@ -41,10 +41,12 @@ import StgCmmEnv
 import StgCmmTicky
 import StgCmmMonad
 import StgCmmUtils
+import StgCmmProf
 
 import MkGraph
 import SMRep
 import Cmm
+import CmmUtils
 import CLabel
 import StgSyn
 import Id
@@ -53,6 +55,7 @@ import TyCon             ( PrimRep(..) )
 import BasicTypes        ( Arity )
 import DynFlags
 import StaticFlags
+import Module
 
 import Constants
 import Util
@@ -93,10 +96,31 @@ emitCall convs@(callConv, _) fun args
   ; updfr_off <- getUpdFrameOff
   ; emitComment $ mkFastString ("emitCall: " ++ show sequel)
   ; case sequel of
-      Return _ -> emit (mkForeignJump callConv fun args updfr_off)
-      AssignTo res_regs _ -> emit =<< mkCall fun convs res_regs args updfr_off
+      Return _ ->
+        emit $ mkForeignJump callConv fun args updfr_off
+      AssignTo res_regs _ ->
+        emit =<< mkCall fun convs res_regs args updfr_off (0,[])
   }
 
+emitCallWithExtraStack
+  :: (Convention, Convention) -> CmmExpr -> [CmmExpr]
+  -> (ByteOff, [(CmmExpr,ByteOff)]) -> FCode ()
+-- (cgCall fun args) makes a call to the entry-code of 'fun',
+-- passing 'args', and returning the results to the current sequel
+emitCallWithExtraStack convs@(callConv, _) fun args extra_stack
+  = do  { adjustHpBackwards
+        ; sequel <- getSequel
+        ; updfr_off <- getUpdFrameOff
+        ; emitComment $ mkFastString ("emitCallWithExtraStack: " ++ show sequel)
+        ; case sequel of
+            Return _ ->
+              emit $ mkForeignJumpExtra callConv fun args updfr_off extra_stack
+            AssignTo res_regs _ -> do
+              emit =<< mkCall fun convs res_regs args updfr_off extra_stack
+        }
+
+
+
 adjustHpBackwards :: FCode ()
 -- This function adjusts and heap pointers just before a tail call or
 -- return.  At a call or return, the virtual heap pointer may be less
@@ -128,6 +152,19 @@ adjustHpBackwards
 -- Making calls: directCall and slowCall
 -------------------------------------------------------------------------
 
+-- General plan is:
+--   - we'll make *one* fast call, either to the function itself
+--     (directCall) or to stg_ap_<pat>_fast (slowCall)
+--     Any left-over arguments will be pushed on the stack,
+--
+--     e.g. Sp[old+8]  = arg1
+--          Sp[old+16] = arg2
+--          Sp[old+32] = stg_ap_pp_info
+--          R2 = arg3
+--          R3 = arg4
+--          call f() return to Nothing updfr_off: 32
+
+
 directCall :: CLabel -> Arity -> [StgArg] -> FCode ()
 -- (directCall f n args)
 -- calls f(arg1, ..., argn), and applies the result to the remaining args
@@ -140,8 +177,19 @@ directCall lbl arity stg_args
 
 slowCall :: CmmExpr -> [StgArg] -> FCode ()
 -- (slowCall fun args) applies fun to args, returning the results to Sequel
 slowCall fun stg_args
-  = do  { cmm_args <- getNonVoidArgAmodes stg_args
-        ; slow_call fun cmm_args (argsReps stg_args) }
+  = do  { dflags <- getDynFlags
+        ; cmm_args <- getNonVoidArgAmodes stg_args
+        ; let platform = targetPlatform dflags
+        ; call <- getCode $ direct_call "slow_call"
+                     (mkRtsApFastLabel rts_fun) arity cmm_args reps
+        ; emitComment $ mkFastString ("slow_call for " ++
+              showSDoc (pprPlatform platform fun) ++
+              " with pat " ++ showSDoc (ftext rts_fun))
+        ; emit (mkAssign nodeReg fun <*> call)
+        }
+  where
+    reps = argsReps stg_args
+    (rts_fun, arity) = slowCallPattern reps
 
 --------------
 direct_call :: String -> CLabel -> Arity -> [CmmExpr] -> [ArgRep] -> FCode ()
@@ -149,7 +197,7 @@ direct_call :: String -> CLabel -> Arity -> [CmmExpr] -> [ArgRep] -> FCode ()
 --     the args exclude the void ones
 -- NB2: 'arity' refers to the *reps*
 direct_call caller lbl arity args reps
-  | debugIsOn && arity > length reps	-- Too few args
+  | debugIsOn && arity > length reps    -- Too few args
   = do -- Caller should ensure that there enough args!
       dflags <- getDynFlags
       let platform = targetPlatform dflags
@@ -157,33 +205,77 @@ direct_call caller lbl arity args reps
            <+> pprPlatform platform lbl <+> ppr (length reps)
           <+> pprPlatform platform args <+> ppr reps )
 
-  | null rest_reps     -- Precisely the right number of arguments
+
+  | null rest_reps     -- Precisely the right number of arguments
   = emitCall (NativeDirectCall, NativeReturn) target args
 
-  | otherwise  -- Over-saturated call
+  | otherwise  -- Note [over-saturated calls]
   = ASSERT( arity == length initial_reps )
-    do  { pap_id <- newTemp gcWord
-        ; withSequel (AssignTo [pap_id] True)
-                     (emitCall (NativeDirectCall, NativeReturn) target fast_args)
-        ; slow_call (CmmReg (CmmLocal pap_id))
-                    rest_args rest_reps }
+    emitCallWithExtraStack (NativeDirectCall, NativeReturn)
+                           target fast_args (mkStkOffsets stack_args)
   where
     target = CmmLit (CmmLabel lbl)
    (initial_reps, rest_reps) = splitAt arity reps
    arg_arity = count isNonV initial_reps
    (fast_args, rest_args) = splitAt arg_arity args
-
---------------
-slow_call :: CmmExpr -> [CmmExpr] -> [ArgRep] -> FCode ()
-slow_call fun args reps
-  = do dflags <- getDynFlags
-       let platform = targetPlatform dflags
-       call <- getCode $ direct_call "slow_call" (mkRtsApFastLabel rts_fun) arity args reps
-       emitComment $ mkFastString ("slow_call for " ++ showSDoc (pprPlatform platform fun) ++
-                                   " with pat " ++ showSDoc (ftext rts_fun))
-       emit (mkAssign nodeReg fun <*> call)
-  where
-    (rts_fun, arity) = slowCallPattern reps
+    stack_args = slowArgs (zip rest_reps rest_args)
+
+
+{-
+Note [over-saturated calls]
+
+The natural thing to do for an over-saturated call would be to call
+the function with the correct number of arguments, and then apply the
+remaining arguments to the value returned, e.g.
+
+  f a b c d   (where f has arity 2)
+  -->
+  r = call f(a,b)
+  call r(c,d)
+
+but this entails
+  - saving c and d on the stack
+  - making a continuation info table
+  - at the continuation, loading c and d off the stack into regs
+  - finally, call r
+
+Note that since there are a fixed number of different r's
+(e.g. stg_ap_pp_fast), we can also pre-compile continuations
+that correspond to each of them, rather than generating a fresh
+one for each over-saturated call.
+
+Not only does this generate much less code, it is faster too.  We will
+generate something like:
+
+Sp[old+16] = c
+Sp[old+24] = d
+Sp[old+32] = stg_ap_pp_info
+call f(a,b) -- usual calling convention
+
+For the purposes of the CmmCall node, we count this extra stack as
+just more arguments that we are passing on the stack (cml_args).
+-}
+
+-- | 'slowArgs' takes a list of function arguments and prepares them for
+-- pushing on the stack for "extra" arguments to a function which requires
+-- fewer arguments than we currently have.
+slowArgs :: [(ArgRep,CmmExpr)] -> [(ArgRep,CmmExpr)]
+slowArgs [] = []
+slowArgs amodes
+  | opt_SccProfilingOn = save_cccs ++ this_pat ++ slowArgs rest
+  | otherwise          = this_pat ++ slowArgs rest
+  where
+    (arg_pat, args, rest) = matchSlowPattern amodes
+    stg_ap_pat = mkCmmRetInfoLabel rtsPackageId arg_pat
+    this_pat   = (N, mkLblExpr stg_ap_pat) : args
+    save_cccs  = [(N, mkLblExpr save_cccs_lbl), (N, curCCS)]
+    save_cccs_lbl = mkCmmRetInfoLabel rtsPackageId (fsLit "stg_restore_cccs")
+
+matchSlowPattern :: [(ArgRep,CmmExpr)]
+                 -> (FastString, [(ArgRep,CmmExpr)], [(ArgRep,CmmExpr)])
+matchSlowPattern amodes = (arg_pat, these, rest)
+  where (arg_pat, n)  = slowCallPattern (map fst amodes)
+        (these, rest) = splitAt n amodes
 
 -- These cases were found to cover about 99% of all slow calls:
 slowCallPattern :: [ArgRep] -> (FastString, Arity)
@@ -206,6 +298,30 @@ slowCallPattern []            = (fsLit "stg_ap_0", 0)
 
 
 -------------------------------------------------------------------------
+-- Fix the byte-offsets of a bunch of things to push on the stack
+
+-- This is used for pushing slow-call continuations.
+-- See Note [over-saturated calls].
+
+mkStkOffsets
+  :: [(ArgRep,CmmExpr)]         -- things to make offsets for
+  -> ( ByteOff                  -- OUTPUTS: Topmost allocated word
+     , [(CmmExpr, ByteOff)] )   -- things with offsets (voids filtered out)
+mkStkOffsets things
+    = loop 0 [] (reverse things)
+  where
+    loop offset offs [] = (offset,offs)
+    loop offset offs ((V,_):things) = loop offset offs things
+        -- ignore Void arguments
+    loop offset offs ((rep,thing):things)
+        = loop thing_off ((thing, thing_off):offs) things
+      where
+        thing_off = offset + argRepSizeW rep * wORD_SIZE
+        -- offset of thing is offset+size, because we're
+        -- growing the stack *downwards* as the offsets increase.
+
+
+-------------------------------------------------------------------------
 -- Classifying arguments: ArgRep
 -------------------------------------------------------------------------
diff --git a/compiler/codeGen/StgCmmMonad.hs b/compiler/codeGen/StgCmmMonad.hs
index 6c5ab4c692..c64df7ecc5 100644
--- a/compiler/codeGen/StgCmmMonad.hs
+++ b/compiler/codeGen/StgCmmMonad.hs
@@ -611,7 +611,7 @@ emitLabel :: BlockId -> FCode ()
 emitLabel id = emitCgStmt (CgLabel id)
 
 emitComment :: FastString -> FCode ()
-#ifdef DEBUG
+#if 0 /* def DEBUG */
 emitComment s = emitCgStmt (CgStmt (CmmComment s))
 #else
 emitComment s = return ()
@@ -688,20 +688,18 @@ mkCmmIfThen e tbranch = do
 
 mkCall :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmActual]
-       -> UpdFrameOffset -> FCode CmmAGraph
-mkCall f (callConv, retConv) results actuals updfr_off = do
+       -> UpdFrameOffset -> (ByteOff,[(CmmExpr,ByteOff)]) -> FCode CmmAGraph
+mkCall f (callConv, retConv) results actuals updfr_off extra_stack = do
   k <- newLabelC
-  let area = CallArea $ Young k
+  let area = Young k
      (off, copyin) = copyInOflow retConv area results
-      copyout = lastWithArgs Call area callConv actuals updfr_off
-                   (toCall f (Just k) updfr_off off)
+      copyout = mkCallReturnsTo f callConv actuals k off updfr_off extra_stack
  return (copyout <*> mkLabel k <*> copyin)
 
-
 mkCmmCall :: CmmExpr -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset
           -> FCode CmmAGraph
-mkCmmCall f results actuals
-  = mkCall f (NativeDirectCall, NativeReturn) results actuals
+mkCmmCall f results actuals updfr_off
+  = mkCall f (NativeDirectCall, NativeReturn) results actuals updfr_off (0,[])
 
 mkSafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual]
@@ -710,7 +708,7 @@ mkSafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual]
 mkSafeCall t fs as upd i = do
   k <- newLabelC
   return
-    ( mkStore (CmmStackSlot (CallArea (Young k)) (widthInBytes wordWidth))
+    ( mkStore (CmmStackSlot (Young k) (widthInBytes wordWidth))
              (CmmLit (CmmBlock k))
     <*> mkLast (CmmForeignCall {tgt=t, res=fs, args=as, succ=k, updfr=upd, intrbl=i})
     <*> mkLabel k)
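
For reference, the slowArgs/matchSlowPattern pair above repeatedly peels the longest stg_ap_* pattern off the front of the remaining argument reps. The following self-contained Haskell model sketches that chunking; its pattern table is a small assumed subset of the real slowCallPattern table (which the hunk above elides), so it is illustrative only:

    import Data.List (isPrefixOf)

    data Rep = Pt | Nw | Vd deriving (Eq, Show)   -- pointer / non-pointer / void

    -- Assumed subset of slowCallPattern's table, longest match first;
    -- the real table also has mixed patterns such as stg_ap_pv.
    patterns :: [(String, [Rep])]
    patterns =
      [ ("stg_ap_pppppp", replicate 6 Pt)
      , ("stg_ap_ppppp",  replicate 5 Pt)
      , ("stg_ap_pppp",   replicate 4 Pt)
      , ("stg_ap_ppp",    replicate 3 Pt)
      , ("stg_ap_pp",     replicate 2 Pt)
      , ("stg_ap_p",      [Pt])
      , ("stg_ap_n",      [Nw])
      , ("stg_ap_v",      [Vd])
      ]

    -- Split argument reps into stg_ap_* chunks, the way slowArgs walks
    -- its amodes; every rep has a one-element pattern, so the head
    -- below cannot fail.
    chunk :: [Rep] -> [String]
    chunk []   = []
    chunk reps = name : chunk (drop (length pat) reps)
      where (name, pat) = head [ p | p@(_, q) <- patterns, q `isPrefixOf` reps ]

For instance, chunk (replicate 7 Pt) gives ["stg_ap_pppppp","stg_ap_p"]: the first chunk names the stg_ap_*_fast entry for the initial fast call, and each later chunk becomes a pre-compiled return frame pushed on the stack, rather than a freshly generated continuation per call.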