diff options
| author | Edward Z. Yang <ezyang@mit.edu> | 2012-09-17 18:28:49 +0200 | 
|---|---|---|
| committer | Edward Z. Yang <ezyang@mit.edu> | 2012-09-26 13:46:57 -0700 | 
| commit | d3128bfc286002862e916296629a22f1ce987e4e (patch) | |
| tree | b819a5ba9bb6079775186726a603e96cbf26c9fb | |
| parent | 2145ffc452a64bae457a38276c81b60f22ddf161 (diff) | |
| download | haskell-d3128bfc286002862e916296629a22f1ce987e4e.tar.gz | |
Partially fix #367 by adding HpLim checks to entry with -fno-omit-yields.
The current fix is relatively dumb as far as where to add HpLim
checks: it will always perform a check unless we know that we're
returning from a closure or we are doing a non let-no-escape case
analysis.  The performance impact on the nofib suite looks like this:
            Min          +5.7%     -0.0%     -6.5%     -6.4%    -50.0%
            Max          +6.3%     +5.8%     +5.0%     +5.5%     +0.8%
 Geometric Mean          +6.2%     +0.1%     +0.5%     +0.5%     -0.8%
Overall, the executable bloat is the biggest problem, so we keep the old
omit-yields optimization on by default. Remember that if you need an
interruptibility guarantee, you need to recompile all of your libraries
with -fno-omit-yields.
A better fix would involve only inserting the yields necessary to break
loops; this is left as future work.
Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
| -rw-r--r-- | compiler/codeGen/StgCmmExpr.hs | 4 | ||||
| -rw-r--r-- | compiler/codeGen/StgCmmHeap.hs | 57 | ||||
| -rw-r--r-- | compiler/main/DynFlags.hs | 4 | ||||
| -rw-r--r-- | docs/users_guide/using.xml | 18 | 
4 files changed, 60 insertions, 23 deletions
| diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs index 307d3715b3..a8ffc12bb0 100644 --- a/compiler/codeGen/StgCmmExpr.hs +++ b/compiler/codeGen/StgCmmExpr.hs @@ -163,9 +163,7 @@ cgLetNoEscapeClosure bndr cc_slot _unused_cc args body     code = forkProc $ do                    { restoreCurrentCostCentre cc_slot                    ; arg_regs <- bindArgsToRegs args -                  ; void $ altHeapCheck arg_regs (cgExpr body) } -                        -- Using altHeapCheck just reduces -                        -- instructions to save on stack +                  ; void $ noEscapeHeapCheck arg_regs (cgExpr body) }  ------------------------------------------------------------------------ diff --git a/compiler/codeGen/StgCmmHeap.hs b/compiler/codeGen/StgCmmHeap.hs index fb3739177c..b7cca48f5a 100644 --- a/compiler/codeGen/StgCmmHeap.hs +++ b/compiler/codeGen/StgCmmHeap.hs @@ -10,7 +10,7 @@ module StgCmmHeap (          getVirtHp, setVirtHp, setRealHp,          getHpRelOffset, hpRel, -        entryHeapCheck, altHeapCheck, altHeapCheckReturnsTo, +        entryHeapCheck, altHeapCheck, noEscapeHeapCheck, altHeapCheckReturnsTo,          mkVirtHeapOffsets, mkVirtConstrOffsets,          mkStaticClosureFields, mkStaticClosure, @@ -371,7 +371,7 @@ entryHeapCheck cl_info nodeSet arity args code         loop_id <- newLabelC         emitLabel loop_id -       heapCheck True (gc_call updfr_sz <*> mkBranch loop_id) code +       heapCheck True True (gc_call updfr_sz <*> mkBranch loop_id) code  {-      -- This code is slightly outdated now and we could easily keep the above @@ -436,32 +436,41 @@ entryHeapCheck cl_info nodeSet arity args code  --           else we do a normal call to stg_gc_noregs  altHeapCheck :: [LocalReg] -> FCode a -> FCode a -altHeapCheck regs code = do +altHeapCheck regs code = altOrNoEscapeHeapCheck False regs code + +altOrNoEscapeHeapCheck :: Bool -> [LocalReg] -> FCode a -> FCode a +altOrNoEscapeHeapCheck checkYield regs code = do      dflags <- getDynFlags      case cannedGCEntryPoint dflags regs of -      Nothing -> genericGC code +      Nothing -> genericGC checkYield code        Just gc -> do          lret <- newLabelC          let (off, copyin) = copyInOflow dflags NativeReturn (Young lret) regs          lcont <- newLabelC          emitOutOfLine lret (copyin <*> mkBranch lcont)          emitLabel lcont -        cannedGCReturnsTo False gc regs lret off code +        cannedGCReturnsTo checkYield False gc regs lret off code  altHeapCheckReturnsTo :: [LocalReg] -> Label -> ByteOff -> FCode a -> FCode a  altHeapCheckReturnsTo regs lret off code    = do dflags <- getDynFlags         case cannedGCEntryPoint dflags regs of -           Nothing -> genericGC code -           Just gc -> cannedGCReturnsTo True gc regs lret off code +           Nothing -> genericGC False code +           Just gc -> cannedGCReturnsTo False True gc regs lret off code + +-- noEscapeHeapCheck is implemented identically to altHeapCheck (which +-- is more efficient), but cannot be optimized away in the non-allocating +-- case because it may occur in a loop +noEscapeHeapCheck :: [LocalReg] -> FCode a -> FCode a +noEscapeHeapCheck regs code = altOrNoEscapeHeapCheck True regs code -cannedGCReturnsTo :: Bool -> CmmExpr -> [LocalReg] -> Label -> ByteOff +cannedGCReturnsTo :: Bool -> Bool -> CmmExpr -> [LocalReg] -> Label -> ByteOff                    -> FCode a                    -> FCode a -cannedGCReturnsTo cont_on_stack gc regs lret off code +cannedGCReturnsTo checkYield cont_on_stack gc regs lret off code    = do dflags <- getDynFlags         updfr_sz <- getUpdFrameOff -       heapCheck False (gc_call dflags gc updfr_sz) code +       heapCheck False checkYield (gc_call dflags gc updfr_sz) code    where      reg_exprs = map (CmmReg . CmmLocal) regs        -- Note [stg_gc arguments] @@ -470,13 +479,13 @@ cannedGCReturnsTo cont_on_stack gc regs lret off code        | cont_on_stack = mkJumpReturnsTo dflags label GC reg_exprs lret off sp        | otherwise     = mkCallReturnsTo dflags label GC reg_exprs lret off sp (0,[]) -genericGC :: FCode a -> FCode a -genericGC code +genericGC :: Bool -> FCode a -> FCode a +genericGC checkYield code    = do updfr_sz <- getUpdFrameOff         lretry <- newLabelC         emitLabel lretry         call <- mkCall generic_gc (GC, GC) [] [] updfr_sz (0,[]) -       heapCheck False (call <*> mkBranch lretry) code +       heapCheck False checkYield (call <*> mkBranch lretry) code  cannedGCEntryPoint :: DynFlags -> [LocalReg] -> Maybe CmmExpr  cannedGCEntryPoint dflags regs @@ -524,22 +533,23 @@ mkGcLabel :: String -> CmmExpr  mkGcLabel s = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit s)))  ------------------------------- -heapCheck :: Bool -> CmmAGraph -> FCode a -> FCode a -heapCheck checkStack do_gc code +heapCheck :: Bool -> Bool -> CmmAGraph -> FCode a -> FCode a +heapCheck checkStack checkYield do_gc code    = getHeapUsage $ \ hpHw ->      -- Emit heap checks, but be sure to do it lazily so      -- that the conditionals on hpHw don't cause a black hole -    do  { codeOnly $ do_checks checkStack hpHw do_gc +    do  { codeOnly $ do_checks checkStack checkYield hpHw do_gc          ; tickyAllocHeap hpHw          ; doGranAllocate hpHw          ; setRealHp hpHw          ; code }  do_checks :: Bool       -- Should we check the stack? +          -> Bool       -- Should we check for preemption?            -> WordOff    -- Heap headroom            -> CmmAGraph  -- What to do on failure            -> FCode () -do_checks checkStack alloc do_gc = do +do_checks checkStack checkYield alloc do_gc = do    dflags <- getDynFlags    let      alloc_lit = mkIntExpr dflags (alloc * wORD_SIZE dflags) -- Bytes @@ -557,15 +567,22 @@ do_checks checkStack alloc do_gc = do      hp_oflo = CmmMachOp (mo_wordUGt dflags)                          [CmmReg hpReg, CmmReg (CmmGlobal HpLim)] +    -- Yielding if HpLim == 0 +    yielding = CmmMachOp (mo_wordEq dflags) +                        [CmmReg (CmmGlobal HpLim), CmmLit (zeroCLit dflags)] +      alloc_n = mkAssign (CmmGlobal HpAlloc) alloc_lit    gc_id <- newLabelC    when checkStack $ do       emit =<< mkCmmIfGoto sp_oflo gc_id -  when (alloc /= 0) $ do -     emitAssign hpReg bump_hp -     emit =<< mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id) +  if (alloc /= 0) +    then do +      emitAssign hpReg bump_hp +      emit =<< mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id) +    else do +      when (not (dopt Opt_OmitYields dflags) && checkYield) (emit =<< mkCmmIfGoto yielding gc_id)    emitOutOfLine gc_id $       do_gc -- this is expected to jump back somewhere diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs index ed273d90e5..b412fc1166 100644 --- a/compiler/main/DynFlags.hs +++ b/compiler/main/DynFlags.hs @@ -293,6 +293,7 @@ data DynFlag     | Opt_IrrefutableTuples     | Opt_CmmSink     | Opt_CmmElimCommonBlocks +   | Opt_OmitYields     -- Interface files     | Opt_IgnoreInterfacePragmas @@ -2275,6 +2276,7 @@ fFlags = [    ( "irrefutable-tuples",               Opt_IrrefutableTuples, nop ),    ( "cmm-sink",                         Opt_CmmSink, nop ),    ( "cmm-elim-common-blocks",           Opt_CmmElimCommonBlocks, nop ), +  ( "omit-yields",                      Opt_OmitYields, nop ),    ( "gen-manifest",                     Opt_GenManifest, nop ),    ( "embed-manifest",                   Opt_EmbedManifest, nop ),    ( "ext-core",                         Opt_EmitExternalCore, nop ), @@ -2459,6 +2461,8 @@ defaultFlags platform        Opt_SharedImplib, +      Opt_OmitYields, +        Opt_GenManifest,        Opt_EmbedManifest,        Opt_PrintBindContents, diff --git a/docs/users_guide/using.xml b/docs/users_guide/using.xml index 2c5217b40d..c3a1366f43 100644 --- a/docs/users_guide/using.xml +++ b/docs/users_guide/using.xml @@ -2332,6 +2332,24 @@ last (x : xs) = last' x xs            </listitem>          </varlistentry> +        <varlistentry> +          <term> +            <option>-fomit-yields</option> +            <indexterm><primary><option>-fomit-yields</option></primary></indexterm> +          </term> +          <listitem> +              <para><emphasis>On by default.</emphasis>  Tells GHC to omit +            heap checks when no allocation is being performed.  While this improves +            binary sizes by about 5%, it also means that threads run in +            tight non-allocating loops will not get preempted in a timely +            fashion.  If it is important to always be able to interrupt such +            threads, you should turn this optimization off.  Consider also +            recompiling all libraries with this optimization turned off, if you +            need to guarantee interruptibility. +            </para> +          </listitem> +        </varlistentry> +        </variablelist>      </sect2> | 
