9 files changed, 411 insertions, 227 deletions
diff --git a/compiler/GHC/Core/Coercion.hs b/compiler/GHC/Core/Coercion.hs
index c749ed0280..3937c0ce3e 100644
--- a/compiler/GHC/Core/Coercion.hs
+++ b/compiler/GHC/Core/Coercion.hs
@@ -2866,7 +2866,7 @@ simplifyArgsWorker orig_ki_binders orig_inner_ki orig_fvs
          -- The bangs here have been observed to improve performance
          -- significantly in optimized builds.
          let kind_co = mkSymCo $
-               liftCoSubst Nominal lc (tyCoBinderType binder)
+                       liftCoSubst Nominal lc (tyCoBinderType binder)
              !casted_xi = xi `mkCastTy` kind_co
              casted_co =  mkCoherenceLeftCo role xi kind_co co
 
diff --git a/compiler/GHC/Core/Opt/OccurAnal.hs b/compiler/GHC/Core/Opt/OccurAnal.hs
index 500c2bdab6..12ffcbb587 100644
--- a/compiler/GHC/Core/Opt/OccurAnal.hs
+++ b/compiler/GHC/Core/Opt/OccurAnal.hs
@@ -832,7 +832,7 @@ occAnalNonRecBind env lvl imp_rule_edges bndr rhs body_usage
 
     certainly_inline -- See Note [Cascading inlines]
       = case occ of
-          OneOcc { occ_in_lam = NotInsideLam, occ_one_br = InOneBranch }
+          OneOcc { occ_in_lam = NotInsideLam, occ_n_br = 1 }
             -> active && not_stable
           _ -> False
 
@@ -2563,7 +2563,7 @@ mkOneOcc id int_cxt arity
   = emptyDetails
   where
     occ_info = OneOcc { occ_in_lam  = NotInsideLam
-                      , occ_one_br  = InOneBranch
+                      , occ_n_br    = oneBranch
                       , occ_int_cxt = int_cxt
                       , occ_tail    = AlwaysTailCalled arity }
 
@@ -2967,11 +2967,15 @@ addOccInfo a1 a2  = ASSERT( not (isDeadOcc a1 || isDeadOcc a2) )
 -- (orOccInfo orig new) is used
 -- when combining occurrence info from branches of a case
 
-orOccInfo (OneOcc { occ_in_lam = in_lam1, occ_int_cxt = int_cxt1
-                  , occ_tail   = tail1 })
-          (OneOcc { occ_in_lam = in_lam2, occ_int_cxt = int_cxt2
-                  , occ_tail   = tail2 })
-  = OneOcc { occ_one_br  = MultipleBranches -- because it occurs in both branches
+orOccInfo (OneOcc { occ_in_lam  = in_lam1
+                  , occ_n_br    = nbr1
+                  , occ_int_cxt = int_cxt1
+                  , occ_tail    = tail1 })
+          (OneOcc { occ_in_lam  = in_lam2
+                  , occ_n_br    = nbr2
+                  , occ_int_cxt = int_cxt2
+                  , occ_tail    = tail2 })
+  = OneOcc { occ_n_br    = nbr1 + nbr2
            , occ_in_lam  = in_lam1 `mappend` in_lam2
            , occ_int_cxt = int_cxt1 `mappend` int_cxt2
            , occ_tail    = tail1 `andTailCallInfo` tail2 }
diff --git a/compiler/GHC/Core/Opt/SetLevels.hs b/compiler/GHC/Core/Opt/SetLevels.hs
index 91e9f6ec34..efcf96e6df 100644
--- a/compiler/GHC/Core/Opt/SetLevels.hs
+++ b/compiler/GHC/Core/Opt/SetLevels.hs
@@ -658,8 +658,8 @@ lvlMFE env strict_ctxt e@(_, AnnCase {})
 lvlMFE env strict_ctxt ann_expr
   |  floatTopLvlOnly env && not (isTopLvl dest_lvl)
          -- Only floating to the top level is allowed.
-  || anyDVarSet isJoinId fvs   -- If there is a free join, don't float
-                               -- See Note [Free join points]
+  || hasFreeJoin env fvs   -- If there is a free join, don't float
+                           -- See Note [Free join points]
   || isExprLevPoly expr
          -- We can't let-bind levity polymorphic expressions
          -- See Note [Levity polymorphism invariants] in GHC.Core
@@ -755,6 +755,14 @@ lvlMFE env strict_ctxt ann_expr
                 && floatConsts env
                 && (not strict_ctxt || is_bot || exprIsHNF expr)
 
+hasFreeJoin :: LevelEnv -> DVarSet -> Bool
+-- Has a free join point which is not being floated to top level.
+-- (In the latter case it won't be a join point any more.)
+-- Not treating top-level ones specially had a massive effect
+-- on nofib/minimax/Prog.prog
+hasFreeJoin env fvs
+  = not (maxFvLevel isJoinId env fvs == tOP_LEVEL)
+
 isBottomThunk :: Maybe (Arity, s) -> Bool
 -- See Note [Bottoming floats] (2)
 isBottomThunk (Just (0, _)) = True   -- Zero arity
diff --git a/compiler/GHC/Core/Opt/Simplify.hs b/compiler/GHC/Core/Opt/Simplify.hs
index abfad1940f..e7fc0fbced 100644
--- a/compiler/GHC/Core/Opt/Simplify.hs
+++ b/compiler/GHC/Core/Opt/Simplify.hs
@@ -6,7 +6,7 @@
 
 {-# LANGUAGE CPP #-}
 
-{-# OPTIONS_GHC -Wno-incomplete-record-updates #-}
+{-# OPTIONS_GHC -Wno-incomplete-record-updates -Wno-incomplete-uni-patterns #-}
 module GHC.Core.Opt.Simplify ( simplTopBinds, simplExpr, simplRules ) where
 
 #include "HsVersions.h"
@@ -39,8 +39,8 @@ import GHC.Core.Opt.Monad ( Tick(..), SimplMode(..) )
 import GHC.Core
 import GHC.Builtin.Types.Prim( realWorldStatePrimTy )
 import GHC.Builtin.Names( runRWKey )
-import GHC.Types.Demand ( StrictSig(..), dmdTypeDepth, isStrictDmd
-                        , mkClosedStrictSig, topDmd, botDiv )
+import GHC.Types.Demand ( StrictSig(..), Demand, dmdTypeDepth, isStrictDmd
+                        , mkClosedStrictSig, topDmd, seqDmd, botDiv )
 import GHC.Types.Cpr    ( mkCprSig, botCpr )
 import GHC.Core.Ppr     ( pprCoreExpr )
 import GHC.Types.Unique ( hasKey )
@@ -598,7 +598,7 @@ prepareRhs mode top_lvl occ rhs0
         = do { (is_exp, floats1, fun') <- go (n_val_args+1) fun
              ; case is_exp of
                 False -> return (False, emptyLetFloats, App fun arg)
-                True  -> do { (floats2, arg') <- makeTrivial mode top_lvl occ arg
+                True  -> do { (floats2, arg') <- makeTrivial mode top_lvl topDmd occ arg
                             ; return (True, floats1 `addLetFlts` floats2, App fun' arg') } }
     go n_val_args (Var fun)
         = return (is_exp, emptyLetFloats, Var fun)
@@ -628,32 +628,34 @@ prepareRhs mode top_lvl occ rhs0
         = return (False, emptyLetFloats, other)
 
 makeTrivialArg :: SimplMode -> ArgSpec -> SimplM (LetFloats, ArgSpec)
-makeTrivialArg mode arg@(ValArg { as_arg = e })
-  = do { (floats, e') <- makeTrivial mode NotTopLevel (fsLit "arg") e
+makeTrivialArg mode arg@(ValArg { as_arg = e, as_dmd = dmd })
+  = do { (floats, e') <- makeTrivial mode NotTopLevel dmd (fsLit "arg") e
        ; return (floats, arg { as_arg = e' }) }
 makeTrivialArg _ arg
   = return (emptyLetFloats, arg)  -- CastBy, TyArg
 
-makeTrivial :: SimplMode -> TopLevelFlag
+makeTrivial :: SimplMode -> TopLevelFlag -> Demand
             -> FastString  -- ^ A "friendly name" to build the new binder from
             -> OutExpr     -- ^ This expression satisfies the let/app invariant
             -> SimplM (LetFloats, OutExpr)
 -- Binds the expression to a variable, if it's not trivial, returning the variable
-makeTrivial mode top_lvl occ_fs expr
+-- For the Demand argument, see Note [Keeping demand info in StrictArg Plan A]
+makeTrivial mode top_lvl dmd occ_fs expr
   | exprIsTrivial expr                          -- Already trivial
   || not (bindingOk top_lvl expr expr_ty)       -- Cannot trivialise
                                                 --   See Note [Cannot trivialise]
   = return (emptyLetFloats, expr)
 
   | Cast expr' co <- expr
-  = do { (floats, triv_expr) <- makeTrivial mode top_lvl occ_fs expr'
+  = do { (floats, triv_expr) <- makeTrivial mode top_lvl dmd occ_fs expr'
        ; return (floats, Cast triv_expr co) }
 
   | otherwise
   = do { (floats, new_id) <- makeTrivialBinding mode top_lvl occ_fs
-                                                vanillaIdInfo expr expr_ty
+                                                id_info expr expr_ty
        ; return (floats, Var new_id) }
   where
+    id_info = vanillaIdInfo `setDemandInfo` dmd
     expr_ty = exprType expr
 
 makeTrivialBinding :: SimplMode -> TopLevelFlag
@@ -1010,13 +1012,17 @@ simplExprF1 env (App fun arg) cont
           -- (instead of one-at-a-time). But in practice, we have not
           -- observed the quadratic behavior, so this extra entanglement
           -- seems not worthwhile.
+          --
+          -- But the (exprType fun) is repeated, to push it into two
+          -- separate, rarely used, thunks; rather than always alloating
+          -- a shared thunk.  Makes a small efficiency difference
         let fun_ty = exprType fun
             (m, _, _) = splitFunTy fun_ty
         in
-                simplExprF env fun $
-                 ApplyToVal { sc_arg = arg, sc_env = env
-                            , sc_hole_ty = substTy env (exprType fun)
-                            , sc_dup = NoDup, sc_cont = cont, sc_mult = m }
+        simplExprF env fun $
+        ApplyToVal { sc_arg = arg, sc_env = env
+                   , sc_hole_ty = substTy env (exprType fun)
+                   , sc_dup = NoDup, sc_cont = cont, sc_mult = m }
 
 simplExprF1 env expr@(Lam {}) cont
   = {-#SCC "simplExprF1-Lam" #-}
@@ -1567,7 +1573,7 @@ simplNonRecE env bndr (rhs, rhs_se) (bndrs, body) cont
          simplLam env' bndrs body cont }
 
   -- Deal with strict bindings
-  | isStrictId bndr          -- Includes coercions
+  | isStrictId bndr          -- Includes coercions, and unlifted types
   , sm_case_case (getMode env)
   = simplExprF (rhs_se `setInScopeFromE` env) rhs
                (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs, sc_body = body
@@ -1924,7 +1930,7 @@ rebuildCall :: SimplEnv
 --    - and rebuild
 
 ---------- Bottoming applications --------------
-rebuildCall env (ArgInfo { ai_fun = fun, ai_args = rev_args, ai_strs = [] }) cont
+rebuildCall env (ArgInfo { ai_fun = fun, ai_args = rev_args, ai_dmds = [] }) cont
   -- When we run out of strictness args, it means
   -- that the call is definitely bottom; see GHC.Core.Opt.Simplify.Utils.mkArgInfo
   -- Then we want to discard the entire strict continuation.  E.g.
@@ -1974,9 +1980,9 @@ rebuildCall env info (ApplyToTy { sc_arg_ty = arg_ty, sc_hole_ty = hole_ty, sc_c
 ---------- The runRW# rule. Do this after absorbing all arguments ------
 -- runRW# :: forall (r :: RuntimeRep) (o :: TYPE r). (State# RealWorld -> o) -> o
 -- K[ runRW# rr ty body ]  -->  runRW rr' ty' (\s. K[ body s ])
-rebuildCall env (ArgInfo { ai_fun = fun, ai_args = rev_args })
+rebuildCall env (ArgInfo { ai_fun = fun_id, ai_args = rev_args })
             (ApplyToVal { sc_arg = arg, sc_env = arg_se, sc_cont = cont, sc_mult = m })
-  | fun `hasKey` runRWKey
+  | fun_id `hasKey` runRWKey
   , not (contIsStop cont)  -- Don't fiddle around if the continuation is boring
   , [ TyArg {}, TyArg {} ] <- rev_args
   = do { s <- newId (fsLit "s") Many realWorldStatePrimTy
@@ -1990,25 +1996,24 @@ rebuildCall env (ArgInfo { ai_fun = fun, ai_args = rev_args })
        ; body' <- simplExprC env' arg cont'
        ; let arg'  = Lam s body'
              rr'   = getRuntimeRep ty'
-             call' = mkApps (Var fun) [mkTyArg rr', mkTyArg ty', arg']
+             call' = mkApps (Var fun_id) [mkTyArg rr', mkTyArg ty', arg']
        ; return (emptyFloats env, call') }
 
-rebuildCall env info@(ArgInfo { ai_encl = encl_rules
-                              , ai_strs = str:strs, ai_discs = disc:discs })
+rebuildCall env fun_info
             (ApplyToVal { sc_arg = arg, sc_env = arg_se
                         , sc_dup = dup_flag, sc_hole_ty = fun_ty
                         , sc_cont = cont, sc_mult = m })
   -- Argument is already simplified
   | isSimplified dup_flag     -- See Note [Avoid redundant simplification]
-  = rebuildCall env (addValArgTo info' (m, arg) fun_ty) cont
+  = rebuildCall env (addValArgTo fun_info (m, arg) fun_ty) cont
 
   -- Strict arguments
-  | str
+  | isStrictArgInfo fun_info
   , sm_case_case (getMode env)
   = -- pprTrace "Strict Arg" (ppr arg $$ ppr (seIdSubst env) $$ ppr (seInScope env)) $
     simplExprF (arg_se `setInScopeFromE` env) arg
-               (StrictArg { sc_fun = info', sc_cci = cci_strict
-                          , sc_dup = Simplified, sc_fun_ty = fun_ty
+               (StrictArg { sc_fun = fun_info, sc_fun_ty = fun_ty
+                          , sc_dup = Simplified
                           , sc_cont = cont, sc_mult = m })
                 -- Note [Shadowing]
 
@@ -2019,27 +2024,11 @@ rebuildCall env info@(ArgInfo { ai_encl = encl_rules
         -- have to be very careful about bogus strictness through
         -- floating a demanded let.
   = do  { arg' <- simplExprC (arg_se `setInScopeFromE` env) arg
-                             (mkLazyArgStop arg_ty cci_lazy)
-        ; rebuildCall env (addValArgTo info' (m, arg') fun_ty) cont }
+                             (mkLazyArgStop arg_ty (lazyArgContext fun_info))
+        ; rebuildCall env (addValArgTo fun_info (m, arg') fun_ty) cont }
   where
-    info'  = info { ai_strs = strs, ai_discs = discs }
     arg_ty = funArgTy fun_ty
 
-    -- Use this for lazy arguments
-    cci_lazy | encl_rules = RuleArgCtxt
-             | disc > 0   = DiscArgCtxt  -- Be keener here
-             | otherwise  = BoringCtxt   -- Nothing interesting
-
-    -- ..and this for strict arguments
-    cci_strict | encl_rules = RuleArgCtxt
-               | disc > 0   = DiscArgCtxt
-               | otherwise  = RhsCtxt
-      -- Why RhsCtxt?  if we see f (g x) (h x), and f is strict, we
-      -- want to be a bit more eager to inline g, because it may
-      -- expose an eval (on x perhaps) that can be eliminated or
-      -- shared. I saw this in nofib 'boyer2', RewriteFuns.onewayunify1
-      -- It's worth an 18% improvement in allocation for this
-      -- particular benchmark; 5% on 'mate' and 1.3% on 'multiplier'
 
 ---------- No further useful info, revert to generic rebuild ------------
 rebuildCall env (ArgInfo { ai_fun = fun, ai_args = rev_args }) cont
@@ -2243,6 +2232,7 @@ trySeqRules in_env scrut rhs cont
                 , TyArg { as_arg_ty  = rhs_ty
                         , as_hole_ty = res2_ty }
                 , ValArg { as_arg = no_cast_scrut
+                         , as_dmd = seqDmd
                          , as_hole_ty = res3_ty
                          , as_mult = Many } ]
                 -- The multiplicity of the scrutiny above is Many because the type
@@ -3268,31 +3258,41 @@ altsWouldDup (alt:alts)
     is_bot_alt (_,_,rhs) = exprIsDeadEnd rhs
 
 -------------------------
-mkDupableCont :: SimplEnv -> SimplCont
+mkDupableCont :: SimplEnv
+              -> SimplCont
               -> SimplM ( SimplFloats  -- Incoming SimplEnv augmented with
                                        --   extra let/join-floats and in-scope variables
                         , SimplCont)   -- dup_cont: duplicable continuation
-
 mkDupableCont env cont
+  = mkDupableContWithDmds env (repeat topDmd) cont
+
+mkDupableContWithDmds
+   :: SimplEnv  -> [Demand]  -- Demands on arguments; always infinite
+   -> SimplCont -> SimplM ( SimplFloats, SimplCont)
+
+mkDupableContWithDmds env _ cont
   | contIsDupable cont
   = return (emptyFloats env, cont)
 
-mkDupableCont _ (Stop {}) = panic "mkDupableCont"     -- Handled by previous eqn
+mkDupableContWithDmds _ _ (Stop {}) = panic "mkDupableCont"     -- Handled by previous eqn
 
-mkDupableCont env (CastIt ty cont)
-  = do  { (floats, cont') <- mkDupableCont env cont
+mkDupableContWithDmds env dmds (CastIt ty cont)
+  = do  { (floats, cont') <- mkDupableContWithDmds env dmds cont
         ; return (floats, CastIt ty cont') }
 
 -- Duplicating ticks for now, not sure if this is good or not
-mkDupableCont env (TickIt t cont)
-  = do  { (floats, cont') <- mkDupableCont env cont
+mkDupableContWithDmds env dmds (TickIt t cont)
+  = do  { (floats, cont') <- mkDupableContWithDmds env dmds cont
         ; return (floats, TickIt t cont') }
 
-mkDupableCont env (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs
-                              , sc_body = body, sc_env = se, sc_cont = cont})
-  -- See Note [Duplicating StrictBind]
+mkDupableContWithDmds env _
+     (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs
+                 , sc_body = body, sc_env = se, sc_cont = cont})
+-- See Note [Duplicating StrictBind]
+-- K[ let x = <> in b ]  -->   join j x = K[ b ]
+--                             j <>
   = do { let sb_env = se `setInScopeFromE` env
-       ; (sb_env1, bndr') <- simplBinder sb_env bndr
+       ; (sb_env1, bndr')      <- simplBinder sb_env bndr
        ; (floats1, join_inner) <- simplLam sb_env1 bndrs body cont
           -- No need to use mkDupableCont before simplLam; we
           -- use cont once here, and then share the result if necessary
@@ -3300,56 +3300,66 @@ mkDupableCont env (StrictBind { sc_bndr = bndr, sc_bndrs = bndrs
        ; let join_body = wrapFloats floats1 join_inner
              res_ty    = contResultType cont
 
-       ; (floats2, body2)
-            <- if exprIsDupable (targetPlatform (seDynFlags env)) join_body
-               then return (emptyFloats env, join_body)
-               else do { join_bndr <- newJoinId [bndr'] res_ty
-                       ; let join_call = App (Var join_bndr) (Var bndr')
-                             join_rhs  = Lam (setOneShotLambda bndr') join_body
-                             join_bind = NonRec join_bndr join_rhs
-                             floats    = emptyFloats env `extendFloats` join_bind
-                       ; return (floats, join_call) }
-       ; return ( floats2
-                , StrictBind { sc_bndr = bndr', sc_bndrs = []
-                             , sc_body = body2
-                             , sc_env  = zapSubstEnv se `setInScopeFromF` floats2
-                                         -- See Note [StaticEnv invariant] in GHC.Core.Opt.Simplify.Utils
-                             , sc_dup  = OkToDup
-                             , sc_cont = mkBoringStop res_ty } ) }
-
-mkDupableCont env (StrictArg { sc_fun = info, sc_cci = cci
-                             , sc_cont = cont, sc_fun_ty = fun_ty, sc_mult = m })
-        -- See Note [Duplicating StrictArg]
-        -- NB: sc_dup /= OkToDup; that is caught earlier by contIsDupable
-  = do { (floats1, cont') <- mkDupableCont env cont
+       ; mkDupableStrictBind env bndr' join_body res_ty }
+
+mkDupableContWithDmds env _
+    (StrictArg { sc_fun = fun, sc_cont = cont
+               , sc_fun_ty = fun_ty, sc_mult = m })
+  -- NB: sc_dup /= OkToDup; that is caught earlier by contIsDupable
+  | thumbsUpPlanA cont
+  = -- Use Plan A of Note [Duplicating StrictArg]
+    do { let (_ : dmds) = ai_dmds fun
+       ; (floats1, cont')  <- mkDupableContWithDmds env dmds cont
+                              -- Use the demands from the function to add the right
+                              -- demand info on any bindings we make for further args
        ; (floats_s, args') <- mapAndUnzipM (makeTrivialArg (getMode env))
-                                           (ai_args info)
+                                           (ai_args fun)
        ; return ( foldl' addLetFloats floats1 floats_s
-                , StrictArg { sc_fun = info { ai_args = args' }
+                , StrictArg { sc_fun = fun { ai_args = args' }
                             , sc_cont = cont'
-                            , sc_cci = cci
                             , sc_fun_ty = fun_ty
                             , sc_mult = m
                             , sc_dup = OkToDup} ) }
 
-mkDupableCont env (ApplyToTy { sc_cont = cont
-                             , sc_arg_ty = arg_ty, sc_hole_ty = hole_ty })
-  = do  { (floats, cont') <- mkDupableCont env cont
+  | otherwise
+  = -- Use Plan B of Note [Duplicating StrictArg]
+    --   K[ f a b <> ]   -->   join j x = K[ f a b x ]
+    --                         j <>
+    do { let arg_ty = funArgTy fun_ty
+             rhs_ty = contResultType cont
+       ; arg_bndr <- newId (fsLit "arg") m arg_ty   -- ToDo: check this linearity argument
+       ; let env' = env `addNewInScopeIds` [arg_bndr]
+       ; (floats, join_rhs) <- rebuildCall env' (addValArgTo fun (m, Var arg_bndr) fun_ty) cont
+       ; mkDupableStrictBind env' arg_bndr (wrapFloats floats join_rhs) rhs_ty }
+  where
+    thumbsUpPlanA (StrictArg {})               = False
+    thumbsUpPlanA (CastIt _ k)                 = thumbsUpPlanA k
+    thumbsUpPlanA (TickIt _ k)                 = thumbsUpPlanA k
+    thumbsUpPlanA (ApplyToVal { sc_cont = k }) = thumbsUpPlanA k
+    thumbsUpPlanA (ApplyToTy  { sc_cont = k }) = thumbsUpPlanA k
+    thumbsUpPlanA (Select {})                  = True
+    thumbsUpPlanA (StrictBind {})              = True
+    thumbsUpPlanA (Stop {})                    = True
+
+mkDupableContWithDmds env dmds
+    (ApplyToTy { sc_cont = cont, sc_arg_ty = arg_ty, sc_hole_ty = hole_ty })
+  = do  { (floats, cont') <- mkDupableContWithDmds env dmds cont
         ; return (floats, ApplyToTy { sc_cont = cont'
                                     , sc_arg_ty = arg_ty, sc_hole_ty = hole_ty }) }
 
-mkDupableCont env (ApplyToVal { sc_arg = arg, sc_dup = dup
-                              , sc_env = se, sc_cont = cont
-                              , sc_hole_ty = hole_ty, sc_mult = mult })
+mkDupableContWithDmds env dmds
+    (ApplyToVal { sc_arg = arg, sc_dup = dup, sc_env = se
+                , sc_cont = cont, sc_hole_ty = hole_ty, sc_mult = mult })
   =     -- e.g.         [...hole...] (...arg...)
         --      ==>
         --              let a = ...arg...
         --              in [...hole...] a
         -- NB: sc_dup /= OkToDup; that is caught earlier by contIsDupable
-    do  { (floats1, cont') <- mkDupableCont env cont
+    do  { let (dmd:_) = dmds   -- Never fails
+        ; (floats1, cont') <- mkDupableContWithDmds env dmds cont
         ; let env' = env `setInScopeFromF` floats1
         ; (_, se', arg') <- simplArg env' dup se arg
-        ; (let_floats2, arg'') <- makeTrivial (getMode env) NotTopLevel (fsLit "karg") arg'
+        ; (let_floats2, arg'') <- makeTrivial (getMode env) NotTopLevel dmd (fsLit "karg") arg'
         ; let all_floats = floats1 `addLetFloats` let_floats2
         ; return ( all_floats
                  , ApplyToVal { sc_arg = arg''
@@ -3361,8 +3371,8 @@ mkDupableCont env (ApplyToVal { sc_arg = arg, sc_dup = dup
                               , sc_dup = OkToDup, sc_cont = cont'
                               , sc_hole_ty = hole_ty, sc_mult = mult }) }
 
-mkDupableCont env (Select { sc_bndr = case_bndr, sc_alts = alts
-                          , sc_env = se, sc_cont = cont })
+mkDupableContWithDmds env _
+    (Select { sc_bndr = case_bndr, sc_alts = alts, sc_env = se, sc_cont = cont })
   =     -- e.g.         (case [...hole...] of { pi -> ei })
         --      ===>
         --              let ji = \xij -> ei
@@ -3404,6 +3414,34 @@ mkDupableCont env (Select { sc_bndr = case_bndr, sc_alts = alts
                                       -- See Note [StaticEnv invariant] in GHC.Core.Opt.Simplify.Utils
                           , sc_cont = mkBoringStop (contResultType cont) } ) }
 
+mkDupableStrictBind :: SimplEnv -> OutId -> OutExpr -> OutType
+                    -> SimplM (SimplFloats, SimplCont)
+mkDupableStrictBind env arg_bndr join_rhs res_ty
+  | exprIsDupable (targetPlatform (seDynFlags env)) join_rhs
+  = return (emptyFloats env
+           , StrictBind { sc_bndr = arg_bndr, sc_bndrs = []
+                        , sc_body = join_rhs
+                        , sc_env  = zapSubstEnv env
+                          -- See Note [StaticEnv invariant] in GHC.Core.Opt.Simplify.Utils
+                        , sc_dup  = OkToDup
+                        , sc_cont = mkBoringStop res_ty } )
+  | otherwise
+  = do { join_bndr <- newJoinId [arg_bndr] res_ty
+       ; let arg_info = ArgInfo { ai_fun   = join_bndr
+                                , ai_rules = Nothing, ai_args  = []
+                                , ai_encl  = False, ai_dmds  = repeat topDmd
+                                , ai_discs = repeat 0 }
+       ; return ( addJoinFloats (emptyFloats env) $
+                  unitJoinFloat                   $
+                  NonRec join_bndr                $
+                  Lam (setOneShotLambda arg_bndr) join_rhs
+                , StrictArg { sc_dup    = OkToDup
+                            , sc_fun    = arg_info
+                            , sc_fun_ty = idType join_bndr
+                            , sc_cont   = mkBoringStop res_ty
+                            , sc_mult   = Many   -- ToDo: check this!
+                            } ) }
+
 mkDupableAlt :: Platform -> OutId
              -> JoinFloats -> OutAlt
              -> SimplM (JoinFloats, OutAlt)
@@ -3577,57 +3615,102 @@ type variables as well as term variables.
 
 Note [Duplicating StrictArg]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-We make a StrictArg duplicable simply by making all its
-stored-up arguments (in sc_fun) trivial, by let-binding
-them.  Thus:
-        f E [..hole..]
-        ==>     let a = E
-                in f a [..hole..]
-Now if the thing in the hole is a case expression (which is when
-we'll call mkDupableCont), we'll push the function call into the
-branches, which is what we want.  Now RULES for f may fire, and
-call-pattern specialisation.  Here's an example from #3116
+Dealing with making a StrictArg continuation duplicable has turned out
+to be one of the trickiest corners of the simplifier, giving rise
+to several cases in which the simplier expanded the program's size
+*exponentially*.  They include
+  #13253 exponential inlining
+  #10421 ditto
+  #18140 strict constructors
+  #18282 another nested-function call case
+
+Suppose we have a call
+  f e1 (case x of { True -> r1; False -> r2 }) e3
+and f is strict in its second argument.  Then we end up in
+mkDupableCont with a StrictArg continuation for (f e1 <> e3).
+There are two ways to make it duplicable.
+
+* Plan A: move the entire call inwards, being careful not
+  to duplicate e1 or e3, thus:
+     let a1 = e1
+         a3 = e3
+     in case x of { True  -> f a1 r1 a3
+                  ; False -> f a1 r2 a3 }
+
+* Plan B: make a join point:
+     join $j x = f e1 x e3
+     in case x of { True  -> jump $j r1
+                  ; False -> jump $j r2 }
+  Notice that Plan B is very like the way we handle strict
+  bindings; see Note [Duplicating StrictBind].
+
+Plan A is good. Here's an example from #3116
      go (n+1) (case l of
                  1  -> bs'
                  _  -> Chunk p fpc (o+1) (l-1) bs')
-If we can push the call for 'go' inside the case, we get
+
+If we pushed the entire call for 'go' inside the case, we get
 call-pattern specialisation for 'go', which is *crucial* for
-this program.
+this particular program.
 
-Here is the (&&) example:
-        && E (case x of { T -> F; F -> T })
-  ==>   let a = E in
-        case x of { T -> && a F; F -> && a T }
-Much better!
-
-Notice that
-  * Arguments to f *after* the strict one are handled by
-    the ApplyToVal case of mkDupableCont.  Eg
-        f [..hole..] E
-
-  * We can only do the let-binding of E because the function
-    part of a StrictArg continuation is an explicit syntax
-    tree.  In earlier versions we represented it as a function
-    (CoreExpr -> CoreEpxr) which we couldn't take apart.
-
-Historical aide: previously we did this (where E is a
-big argument:
-        f E [..hole..]
-        ==>     let $j = \a -> f E a
-                in $j [..hole..]
-
-But this is terrible! Here's an example:
+Here is another example.
         && E (case x of { T -> F; F -> T })
-Now, && is strict so we end up simplifying the case with
-an ArgOf continuation.  If we let-bind it, we get
-        let $j = \v -> && E v
-        in simplExpr (case x of { T -> F; F -> T })
-                     (ArgOf (\r -> $j r)
-And after simplifying more we get
-        let $j = \v -> && E v
-        in case x of { T -> $j F; F -> $j T }
-Which is a Very Bad Thing
 
+Pushing the call inward (being careful not to duplicate E)
+        let a = E
+        in case x of { T -> && a F; F -> && a T }
+
+and now the (&& a F) etc can optimise.  Moreover there might
+be a RULE for the function that can fire when it "sees" the
+particular case alterantive.
+
+But Plan A can have terrible, terrible behaviour. Here is a classic
+case:
+  f (f (f (f (f True))))
+
+Suppose f is strict, and has a body that is small enough to inline.
+The innermost call inlines (seeing the True) to give
+  f (f (f (f (case v of { True -> e1; False -> e2 }))))
+
+Now, suppose we naively push the entire continuation into both
+case branches (it doesn't look large, just f.f.f.f). We get
+  case v of
+    True  -> f (f (f (f e1)))
+    False -> f (f (f (f e2)))
+
+And now the process repeats, so we end up with an exponentially large
+number of copies of f. No good!
+
+CONCLUSION: we want Plan A in general, but do Plan B is there a
+danger of this nested call behaviour. The function that decides
+this is called thumbsUpPlanA.
+
+Note [Keeping demand info in StrictArg Plan A]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Following on from Note [Duplicating StrictArg], another common code
+pattern that can go bad is this:
+   f (case x1 of { T -> F; F -> T })
+     (case x2 of { T -> F; F -> T })
+     ...etc...
+when f is strict in all its arguments.  (It might, for example, be a
+strict data constructor whose wrapper has not yet been inlined.)
+
+We use Plan A (because there is no nesting) giving
+  let a2 = case x2 of ...
+      a3 = case x3 of ...
+  in case x1 of { T -> f F a2 a3 ... ; F -> f T a2 a3 ... }
+
+Now we must be careful!  a2 and a3 are small, and the OneOcc code in
+postInlineUnconditionally may inline them both at both sites; see Note
+Note [Inline small things to avoid creating a thunk] in
+Simplify.Utils. But if we do inline them, the entire process will
+repeat -- back to exponential behaviour.
+
+So we are careful to keep the demand-info on a2 and a3.  Then they'll
+be /strict/ let-bindings, which will be dealt with by StrictBind.
+That's why contIsDupableWithDmds is careful to propagage demand
+info to the auxiliary bindings it creates.  See the Demand argument
+to makeTrivial.
 
 Note [Duplicating StrictBind]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -3636,9 +3719,10 @@ that for case expressions.  After all,
    let x* = e in b   is similar to    case e of x -> b
 
 So we potentially make a join-point for the body, thus:
-   let x = [] in b   ==>   join j x = b
-                           in let x = [] in j x
+   let x = <> in b   ==>   join j x = b
+                           in j <>
 
+Just like StrictArg in fact -- and indeed they share code.
 
 Note [Join point abstraction]  Historical note
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/compiler/GHC/Core/Opt/Simplify/Utils.hs b/compiler/GHC/Core/Opt/Simplify/Utils.hs
index e9ee16157f..0d3a577938 100644
--- a/compiler/GHC/Core/Opt/Simplify/Utils.hs
+++ b/compiler/GHC/Core/Opt/Simplify/Utils.hs
@@ -29,6 +29,7 @@ module GHC.Core.Opt.Simplify.Utils (
         ArgInfo(..), ArgSpec(..), mkArgInfo,
         addValArgTo, addCastTo, addTyArgTo,
         argInfoExpr, argInfoAppArgs, pushSimplifiedArgs,
+        isStrictArgInfo, lazyArgContext,
 
         abstractFloats,
 
@@ -153,8 +154,9 @@ data SimplCont
   | StrictArg           -- (StrictArg (f e1 ..en) K)[e] = K[ f e1 .. en e ]
       { sc_dup  :: DupFlag     -- Always Simplified or OkToDup
       , sc_fun  :: ArgInfo     -- Specifies f, e1..en, Whether f has rules, etc
-                               --     plus strictness flags for *further* args
-      , sc_cci  :: CallCtxt    -- Whether *this* argument position is interesting
+                               --     plus demands and discount flags for *this* arg
+                               --          and further args
+                               --     So ai_dmds and ai_discs are never empty
       , sc_fun_ty :: OutType   -- Type of the function (f e1 .. en),
                                -- presumably (arg_ty -> res_ty)
                                -- where res_ty is expected by sc_cont
@@ -269,32 +271,52 @@ data ArgInfo
                                 -- or an enclosing one has rules (recursively)
                                 --      True => be keener to inline in all args
 
-        ai_strs :: [Bool],      -- Strictness of remaining arguments
+        ai_dmds :: [Demand],    -- Demands on remaining value arguments (beyond ai_args)
                                 --   Usually infinite, but if it is finite it guarantees
                                 --   that the function diverges after being given
                                 --   that number of args
-        ai_discs :: [Int]       -- Discounts for remaining arguments; non-zero => be keener to inline
+
+        ai_discs :: [Int]       -- Discounts for remaining value arguments (beyong ai_args)
+                                --   non-zero => be keener to inline
                                 --   Always infinite
     }
 
 data ArgSpec
   = ValArg { as_mult :: Mult
-           , as_arg :: OutExpr        -- Apply to this (coercion or value); c.f. ApplyToVal
+           , as_dmd  :: Demand        -- Demand placed on this argument
+           , as_arg  :: OutExpr       -- Apply to this (coercion or value); c.f. ApplyToVal
            , as_hole_ty :: OutType }  -- Type of the function (presumably t1 -> t2)
+
   | TyArg { as_arg_ty  :: OutType     -- Apply to this type; c.f. ApplyToTy
           , as_hole_ty :: OutType }   -- Type of the function (presumably forall a. blah)
+
   | CastBy OutCoercion                -- Cast by this; c.f. CastIt
 
+instance Outputable ArgInfo where
+  ppr (ArgInfo { ai_fun = fun, ai_args = args, ai_dmds = dmds })
+    = text "ArgInfo" <+> braces
+         (sep [ text "fun =" <+> ppr fun
+              , text "dmds =" <+> ppr dmds
+              , text "args =" <+> ppr args ])
+
 instance Outputable ArgSpec where
   ppr (ValArg { as_mult = mult, as_arg = arg })  = text "ValArg" <+> ppr mult <+> ppr arg
   ppr (TyArg { as_arg_ty = ty }) = text "TyArg" <+> ppr ty
   ppr (CastBy c)                 = text "CastBy" <+> ppr c
 
 addValArgTo :: ArgInfo -> (Mult, OutExpr) -> OutType -> ArgInfo
-addValArgTo ai (w, arg) hole_ty = ai { ai_args = arg_spec : ai_args ai
-                                     , ai_rules = decRules (ai_rules ai) }
-  where
-    arg_spec = ValArg { as_arg = arg, as_hole_ty = hole_ty, as_mult = w }
+addValArgTo ai (w, arg) hole_ty
+  | ArgInfo { ai_dmds = dmd:dmds, ai_discs = _:discs, ai_rules = rules } <- ai
+      -- Pop the top demand and and discounts off
+  , let arg_spec = ValArg { as_arg = arg, as_hole_ty = hole_ty
+                          , as_mult = w, as_dmd = dmd }
+  = ai { ai_args  = arg_spec : ai_args ai
+       , ai_dmds  = dmds
+       , ai_discs = discs
+       , ai_rules = decRules rules }
+  | otherwise
+  = pprPanic "addValArgTo" (ppr ai $$ ppr arg)
+    -- There should always be enough demands and discounts
 
 addTyArgTo :: ArgInfo -> OutType -> OutType -> ArgInfo
 addTyArgTo ai arg_ty hole_ty = ai { ai_args = arg_spec : ai_args ai
@@ -305,6 +327,12 @@ addTyArgTo ai arg_ty hole_ty = ai { ai_args = arg_spec : ai_args ai
 addCastTo :: ArgInfo -> OutCoercion -> ArgInfo
 addCastTo ai co = ai { ai_args = CastBy co : ai_args ai }
 
+isStrictArgInfo :: ArgInfo -> Bool
+-- True if the function is strict in the next argument
+isStrictArgInfo (ArgInfo { ai_dmds = dmds })
+  | dmd:_ <- dmds = isStrictDmd dmd
+  | otherwise     = False
+
 argInfoAppArgs :: [ArgSpec] -> [OutExpr]
 argInfoAppArgs []                              = []
 argInfoAppArgs (CastBy {}                : _)  = []  -- Stop at a cast
@@ -461,8 +489,8 @@ contArgs cont
   | otherwise = go [] cont
   where
     lone (ApplyToTy  {}) = False  -- See Note [Lone variables] in GHC.Core.Unfold
-    lone (ApplyToVal {}) = False
-    lone (CastIt {})     = False
+    lone (ApplyToVal {}) = False  -- NB: even a type application or cast
+    lone (CastIt {})     = False  --     stops it being "lone"
     lone _               = True
 
     go args (ApplyToVal { sc_arg = arg, sc_env = se, sc_cont = k })
@@ -489,17 +517,16 @@ mkArgInfo env fun rules n_val_args call_cont
   = ArgInfo { ai_fun = fun, ai_args = []
             , ai_rules = fun_rules
             , ai_encl = False
-            , ai_strs = vanilla_stricts
+            , ai_dmds = vanilla_dmds
             , ai_discs = vanilla_discounts }
   | otherwise
-  = ArgInfo { ai_fun = fun, ai_args = []
+  = ArgInfo { ai_fun   = fun
+            , ai_args = []
             , ai_rules = fun_rules
             , ai_encl  = interestingArgContext rules call_cont
-            , ai_strs  = arg_stricts
+            , ai_dmds  = add_type_strictness (idType fun) arg_dmds
             , ai_discs = arg_discounts }
   where
-    fun_ty = idType fun
-
     fun_rules = mkFunRules rules
 
     vanilla_discounts, arg_discounts :: [Int]
@@ -509,14 +536,14 @@ mkArgInfo env fun rules n_val_args call_cont
                               -> discounts ++ vanilla_discounts
                         _     -> vanilla_discounts
 
-    vanilla_stricts, arg_stricts :: [Bool]
-    vanilla_stricts  = repeat False
+    vanilla_dmds, arg_dmds :: [Demand]
+    vanilla_dmds  = repeat topDmd
 
-    arg_stricts
+    arg_dmds
       | not (sm_inline (seMode env))
-      = vanilla_stricts -- See Note [Do not expose strictness if sm_inline=False]
+      = vanilla_dmds -- See Note [Do not expose strictness if sm_inline=False]
       | otherwise
-      = add_type_str fun_ty $
+      = -- add_type_str fun_ty $
         case splitStrictSig (idStrictness fun) of
           (demands, result_info)
                 | not (demands `lengthExceeds` n_val_args)
@@ -529,36 +556,40 @@ mkArgInfo env fun rules n_val_args call_cont
                         -- inlining lone variables, so its ok
                         -- (see GHC.Core.Op.Simplify.Utils.analyseCont)
                    if isDeadEndDiv result_info then
-                        map isStrictDmd demands         -- Finite => result is bottom
+                        demands  -- Finite => result is bottom
                    else
-                        map isStrictDmd demands ++ vanilla_stricts
+                        demands ++ vanilla_dmds
                | otherwise
                -> WARN( True, text "More demands than arity" <+> ppr fun <+> ppr (idArity fun)
                                 <+> ppr n_val_args <+> ppr demands )
-                   vanilla_stricts      -- Not enough args, or no strictness
+                  vanilla_dmds      -- Not enough args, or no strictness
 
-    add_type_str :: Type -> [Bool] -> [Bool]
+    add_type_strictness :: Type -> [Demand] -> [Demand]
     -- If the function arg types are strict, record that in the 'strictness bits'
     -- No need to instantiate because unboxed types (which dominate the strict
     --   types) can't instantiate type variables.
-    -- add_type_str is done repeatedly (for each call);
+    -- add_type_strictness is done repeatedly (for each call);
     --   might be better once-for-all in the function
     -- But beware primops/datacons with no strictness
 
-    add_type_str _ [] = []
-    add_type_str fun_ty all_strs@(str:strs)
+    add_type_strictness fun_ty dmds
+      | null dmds = []
+
+      | Just (_, fun_ty') <- splitForAllTy_maybe fun_ty
+      = add_type_strictness fun_ty' dmds     -- Look through foralls
+
       | Just (_, arg_ty, fun_ty') <- splitFunTy_maybe fun_ty        -- Add strict-type info
-      = (str || Just False == isLiftedType_maybe arg_ty)
-        : add_type_str fun_ty' strs
+      , dmd : rest_dmds <- dmds
+      , let dmd' = case isLiftedType_maybe arg_ty of
+                       Just False -> strictenDmd dmd
+                       _          -> dmd
+      = dmd' : add_type_strictness fun_ty' rest_dmds
           -- If the type is levity-polymorphic, we can't know whether it's
           -- strict. isLiftedType_maybe will return Just False only when
           -- we're sure the type is unlifted.
 
-      | Just (_, fun_ty') <- splitForAllTy_maybe fun_ty
-      = add_type_str fun_ty' all_strs     -- Look through foralls
-
       | otherwise
-      = all_strs
+      = dmds
 
 {- Note [Unsaturated functions]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -647,6 +678,26 @@ This made a small compile-time perf improvement in perf/compiler/T6048,
 and it looks plausible to me.
 -}
 
+lazyArgContext :: ArgInfo -> CallCtxt
+-- Use this for lazy arguments
+lazyArgContext (ArgInfo { ai_encl = encl_rules, ai_discs = discs })
+  | encl_rules                = RuleArgCtxt
+  | disc:_ <- discs, disc > 0 = DiscArgCtxt  -- Be keener here
+  | otherwise                 = BoringCtxt   -- Nothing interesting
+
+strictArgContext :: ArgInfo -> CallCtxt
+strictArgContext (ArgInfo { ai_encl = encl_rules, ai_discs = discs })
+-- Use this for strict arguments
+  | encl_rules                = RuleArgCtxt
+  | disc:_ <- discs, disc > 0 = DiscArgCtxt  -- Be keener here
+  | otherwise                 = RhsCtxt
+      -- Why RhsCtxt?  if we see f (g x) (h x), and f is strict, we
+      -- want to be a bit more eager to inline g, because it may
+      -- expose an eval (on x perhaps) that can be eliminated or
+      -- shared. I saw this in nofib 'boyer2', RewriteFuns.onewayunify1
+      -- It's worth an 18% improvement in allocation for this
+      -- particular benchmark; 5% on 'mate' and 1.3% on 'multiplier'
+
 interestingCallContext :: SimplEnv -> SimplCont -> CallCtxt
 -- See Note [Interesting call context]
 interestingCallContext env cont
@@ -663,7 +714,7 @@ interestingCallContext env cont
         -- motivation to inline. See Note [Cast then apply]
         -- in GHC.Core.Unfold
 
-    interesting (StrictArg { sc_cci = cci }) = cci
+    interesting (StrictArg { sc_fun = fun }) = strictArgContext fun
     interesting (StrictBind {})              = BoringCtxt
     interesting (Stop _ cci)                 = cci
     interesting (TickIt _ k)                 = interesting k
@@ -713,16 +764,13 @@ interestingArgContext rules call_cont
     go (Select {})                  = False
     go (ApplyToVal {})              = False  -- Shouldn't really happen
     go (ApplyToTy  {})              = False  -- Ditto
-    go (StrictArg { sc_cci = cci }) = interesting cci
+    go (StrictArg { sc_fun = fun }) = ai_encl fun
     go (StrictBind {})              = False      -- ??
     go (CastIt _ c)                 = go c
-    go (Stop _ cci)                 = interesting cci
+    go (Stop _ RuleArgCtxt)         = True
+    go (Stop _ _)                   = False
     go (TickIt _ c)                 = go c
 
-    interesting RuleArgCtxt = True
-    interesting _           = False
-
-
 {- Note [Interesting arguments]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 An argument is interesting if it deserves a discount for unfoldings
@@ -1201,9 +1249,9 @@ preInlineUnconditionally env top_lvl bndr rhs rhs_env
     extend_subst_with inl_rhs = extendIdSubst env bndr (mkContEx rhs_env inl_rhs)
 
     one_occ IAmDead = True -- Happens in ((\x.1) v)
-    one_occ OneOcc{ occ_one_br = InOneBranch
+    one_occ OneOcc{ occ_n_br   = 1
                   , occ_in_lam = NotInsideLam }   = isNotTopLevel top_lvl || early_phase
-    one_occ OneOcc{ occ_one_br = InOneBranch
+    one_occ OneOcc{ occ_n_br   = 1
                   , occ_in_lam = IsInsideLam
                   , occ_int_cxt = IsInteresting } = canInlineInLam rhs
     one_occ _                                     = False
@@ -1317,24 +1365,15 @@ postInlineUnconditionally env top_lvl bndr occ_info rhs
   | exprIsTrivial rhs           = True
   | otherwise
   = case occ_info of
-        -- The point of examining occ_info here is that for *non-values*
-        -- that occur outside a lambda, the call-site inliner won't have
-        -- a chance (because it doesn't know that the thing
-        -- only occurs once).   The pre-inliner won't have gotten
-        -- it either, if the thing occurs in more than one branch
-        -- So the main target is things like
-        --      let x = f y in
-        --      case v of
-        --         True  -> case x of ...
-        --         False -> case x of ...
-        -- This is very important in practice; e.g. wheel-seive1 doubles
-        -- in allocation if you miss this out
-      OneOcc { occ_in_lam = in_lam, occ_int_cxt = int_cxt }
-               -- OneOcc => no code-duplication issue
-        ->     smallEnoughToInline dflags unfolding     -- Small enough to dup
+      OneOcc { occ_in_lam = in_lam, occ_int_cxt = int_cxt, occ_n_br = n_br }
+        -- See Note [Inline small things to avoid creating a thunk]
+
+        -> n_br < 100  -- See Note [Suppress exponential blowup]
+
+           && smallEnoughToInline dflags unfolding     -- Small enough to dup
                         -- ToDo: consider discount on smallEnoughToInline if int_cxt is true
                         --
-                        -- NB: Do NOT inline arbitrarily big things, even if one_br is True
+                        -- NB: Do NOT inline arbitrarily big things, even if occ_n_br=1
                         -- Reason: doing so risks exponential behaviour.  We simplify a big
                         --         expression, inline it, and simplify it again.  But if the
                         --         very same thing happens in the big expression, we get
@@ -1381,7 +1420,56 @@ postInlineUnconditionally env top_lvl bndr occ_info rhs
     active    = isActive (sm_phase (getMode env)) (idInlineActivation bndr)
         -- See Note [pre/postInlineUnconditionally in gentle mode]
 
-{-
+{- Note [Inline small things to avoid creating a thunk]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The point of examining occ_info here is that for *non-values* that
+occur outside a lambda, the call-site inliner won't have a chance
+(because it doesn't know that the thing only occurs once).  The
+pre-inliner won't have gotten it either, if the thing occurs in more
+than one branch So the main target is things like
+
+     let x = f y in
+     case v of
+        True  -> case x of ...
+        False -> case x of ...
+
+This is very important in practice; e.g. wheel-seive1 doubles
+in allocation if you miss this out.  And bits of GHC itself start
+to allocate more.  An egregious example is test perf/compiler/T14697,
+where GHC.Driver.CmdLine.$wprocessArgs allocated hugely more.
+
+Note [Suppress exponential blowup]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+In #13253, and several related tickets, we got an exponential blowup
+in code size from postInlineUnconditionally.  The trouble comes when
+we have
+  let j1a = case f y     of { True -> p;   False -> q }
+      j1b = case f y     of { True -> q;   False -> p }
+      j2a = case f (y+1) of { True -> j1a; False -> j1b }
+      j2b = case f (y+1) of { True -> j1b; False -> j1a }
+      ...
+  in case f (y+10) of { True -> j10a; False -> j10b }
+
+when there are many branches. In pass 1, postInlineUnconditionally
+inlines j10a and j10b (they are both small).  Now we have two calls
+to j9a and two to j9b.  In pass 2, postInlineUnconditionally inlines
+all four of these calls, leaving four calls to j8a and j8b. Etc.
+Yikes!  This is exponential!
+
+A possible plan: stop doing postInlineUnconditionally
+for some fixed, smallish number of branches, say 4. But that turned
+out to be bad: see Note [Inline small things to avoid creating a thunk].
+And, as it happened, the problem with #13253 was solved in a
+different way (Note [Duplicating StrictArg] in Simplify).
+
+So I just set an arbitrary, high limit of 100, to stop any
+totally exponential behaviour.
+
+This still leaves the nasty possiblity that /ordinary/ inlining (not
+postInlineUnconditionally) might inline these join points, each of
+which is individually quiet small.  I'm still not sure what to do
+about this (e.g. see #15488).
+
 Note [Top level and postInlineUnconditionally]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We don't do postInlineUnconditionally for top-level things (even for
diff --git a/compiler/GHC/Core/SimpleOpt.hs b/compiler/GHC/Core/SimpleOpt.hs
index afd915cf86..ab3eed4b60 100644
--- a/compiler/GHC/Core/SimpleOpt.hs
+++ b/compiler/GHC/Core/SimpleOpt.hs
@@ -433,7 +433,7 @@ simple_bind_pair env@(SOE { soe_inl = inl_env, soe_subst = subst })
     safe_to_inline IAmALoopBreaker{}                  = False
     safe_to_inline IAmDead                            = True
     safe_to_inline OneOcc{ occ_in_lam = NotInsideLam
-                         , occ_one_br = InOneBranch } = True
+                         , occ_n_br = 1 }             = True
     safe_to_inline OneOcc{}                           = False
     safe_to_inline ManyOccs{}                         = False
 
diff --git a/compiler/GHC/Types/Basic.hs b/compiler/GHC/Types/Basic.hs
index cf373f76d5..a0693b3f86 100644
--- a/compiler/GHC/Types/Basic.hs
+++ b/compiler/GHC/Types/Basic.hs
@@ -68,7 +68,7 @@ module GHC.Types.Basic (
         isNoOccInfo, strongLoopBreaker, weakLoopBreaker,
 
         InsideLam(..),
-        OneBranch(..),
+        BranchCount, oneBranch,
         InterestingCxt(..),
         TailCallInfo(..), tailCallInfo, zapOccTailCallInfo,
         isAlwaysTailCalled,
@@ -978,7 +978,7 @@ data OccInfo
                         -- lambda and case-bound variables.
 
   | OneOcc          { occ_in_lam  :: !InsideLam
-                    , occ_one_br  :: !OneBranch
+                    , occ_n_br    :: {-# UNPACK #-} !BranchCount
                     , occ_int_cxt :: !InterestingCxt
                     , occ_tail    :: !TailCallInfo }
                         -- ^ Occurs exactly once (per branch), not inside a rule
@@ -992,6 +992,16 @@ data OccInfo
 
 type RulesOnly = Bool
 
+type BranchCount = Int
+  -- For OneOcc, the BranchCount says how many syntactic occurrences there are
+  -- At the moment we really only check for 1 or >1, but in principle
+  --   we could pay attention to how *many* occurences there are
+  --   (notably in postInlineUnconditionally).
+  -- But meanwhile, Ints are very efficiently represented.
+
+oneBranch :: BranchCount
+oneBranch = 1
+
 {-
 Note [LoopBreaker OccInfo]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1058,14 +1068,6 @@ instance Monoid InsideLam where
   mappend = (Semi.<>)
 
 -----------------
-data OneBranch
-  = InOneBranch
-    -- ^ One syntactic occurrence: Occurs in only one case branch
-    -- so no code-duplication issue to worry about
-  | MultipleBranches
-  deriving (Eq)
-
------------------
 data TailCallInfo = AlwaysTailCalled JoinArity -- See Note [TailCallInfo]
                   | NoTailCallInfo
   deriving (Eq)
@@ -1124,12 +1126,10 @@ instance Outputable OccInfo where
           pp_ro | rule_only = char '!'
                 | otherwise = empty
   ppr (OneOcc inside_lam one_branch int_cxt tail_info)
-        = text "Once" <> pp_lam inside_lam <> pp_br one_branch <> pp_args int_cxt <> pp_tail
+        = text "Once" <> pp_lam inside_lam <> ppr one_branch <> pp_args int_cxt <> pp_tail
         where
           pp_lam IsInsideLam     = char 'L'
           pp_lam NotInsideLam    = empty
-          pp_br MultipleBranches = char '*'
-          pp_br InOneBranch      = empty
           pp_args IsInteresting  = char '!'
           pp_args NotInteresting = empty
           pp_tail                = pprShortTailCallInfo tail_info
@@ -1156,7 +1156,7 @@ AlwaysTailCalled.
 
 Note that there is a 'TailCallInfo' on a 'ManyOccs' value. One might expect that
 being tail-called would mean that the variable could only appear once per branch
-(thus getting a `OneOcc { occ_one_br = True }` occurrence info), but a join
+(thus getting a `OneOcc { }` occurrence info), but a join
 point can also be invoked from other join points, not just from case branches:
 
   let j1 x = ...
@@ -1167,7 +1167,7 @@ point can also be invoked from other join points, not just from case branches:
        C -> j2 q
 
 Here both 'j1' and 'j2' will get marked AlwaysTailCalled, but j1 will get
-ManyOccs and j2 will get `OneOcc { occ_one_br = True }`.
+ManyOccs and j2 will get `OneOcc { occ_n_br = 2 }`.
 
 ************************************************************************
 *                                                                      *
diff --git a/compiler/GHC/Types/Demand.hs b/compiler/GHC/Types/Demand.hs
index fd504eda30..51acdf3d8e 100644
--- a/compiler/GHC/Types/Demand.hs
+++ b/compiler/GHC/Types/Demand.hs
@@ -1285,14 +1285,14 @@ splitDmdTy ty@(DmdType _ [] res_ty)       = (defaultArgDmd res_ty, ty)
 deferAfterPreciseException :: DmdType -> DmdType
 deferAfterPreciseException = lubDmdType exnDmdType
 
-strictenDmd :: Demand -> CleanDemand
+strictenDmd :: Demand -> Demand
 strictenDmd (JD { sd = s, ud = u})
   = JD { sd = poke_s s, ud = poke_u u }
   where
-    poke_s Lazy      = HeadStr
-    poke_s (Str s)   = s
-    poke_u Abs       = UHead
-    poke_u (Use _ u) = u
+    poke_s Lazy      = Str HeadStr
+    poke_s s         = s
+    poke_u Abs       = useTop
+    poke_u u         = u
 
 -- Deferring and peeling
 
diff --git a/compiler/GHC/Types/Id/Info.hs b/compiler/GHC/Types/Id/Info.hs
index dfd6ef96ab..f67f581b74 100644
--- a/compiler/GHC/Types/Id/Info.hs
+++ b/compiler/GHC/Types/Id/Info.hs
@@ -58,7 +58,7 @@ module GHC.Types.Id.Info (
         isDeadOcc, isStrongLoopBreaker, isWeakLoopBreaker,
         occInfo, setOccInfo,
 
-        InsideLam(..), OneBranch(..),
+        InsideLam(..), BranchCount,
 
         TailCallInfo(..),
         tailCallInfo, isAlwaysTailCalled,