module Supercompile.Evaluator.Evaluate (normalise, step, gc, shouldExposeUnfolding) where

#include "HsVersions.h"

import Supercompile.Evaluator.Deeds
import Supercompile.Evaluator.FreeVars
import Supercompile.Evaluator.Residualise
import Supercompile.Evaluator.Syntax

import Supercompile.Core.Renaming
import Supercompile.Core.Syntax

import Supercompile.GHC (termToCoreExpr)
import Supercompile.StaticFlags
import Supercompile.Utilities

import qualified Data.Map as M

import qualified CoreSyn as CoreSyn
import CoreUnfold
import DynFlags (DynFlag(..), defaultDynFlags, dopt_set)
import Coercion (liftCoSubstWith, coercionKind, isReflCo, mkUnsafeCo)
import TyCon
import Type
import PrelRules
import Id
import Module
import Name (nameModule_maybe)
import IdInfo (isShortableIdInfo)
import DataCon
import Pair
import BasicTypes
import Demand (splitStrictSig, isBotRes)


-- FIXME: this doesn't really work very well if the Answers are indirections, which is a common case!
-- However, this is not so serious a problem since I started spotting manifest primops applications and
-- avoiding going through the wrapper to compile them.
evaluatePrim :: InScopeSet -> Tag -> PrimOp -> [Type] -> [Coerced Answer] -> Maybe (Anned (Coerced Answer))
evaluatePrim iss tg pop tys args = do
    args' <- fmap (map CoreSyn.Type tys ++) $ mapM to args
    (res:_) <- return [res | CoreSyn.BuiltinRule { CoreSyn.ru_nargs = nargs, CoreSyn.ru_try = f }
                          <- primOpRules pop (error "evaluatePrim: dummy primop name")
                      , nargs == length args
                      , Just res <- [f (const CoreSyn.NoUnfolding) args']]
    fmap (annedCoercedAnswer tg) $ fro res
  where
    to :: Coerced Answer -> Maybe CoreSyn.CoreExpr
    to (mb_co, (rn, v)) = fmap coerce $ case v of
        Literal l          -> Just (CoreSyn.Lit l)
        Coercion co        -> Just (CoreSyn.Coercion co)
        Data dc tys cos xs -> Just (CoreSyn.Var (dataConWrapId dc) `CoreSyn.mkTyApps` map (renameType iss rn) tys `CoreSyn.mkCoApps` cos `CoreSyn.mkVarApps` map (renameId rn) xs)
        _                  -> Nothing
      where
        -- It is quite important that we don't wrap things with spurious refl coercions when it comes
        -- to RULEs, because the default constant-folding rules don't trigger if there are too many coercions.
        -- Luckily, the coercion within a CastBy is guaranteed to be non-refl.
        coerce | Just co <- castByCo mb_co
               = (`CoreSyn.Cast` co)
               | otherwise
               = id

    fro :: CoreSyn.CoreExpr -> Maybe (Coerced Answer)
    fro (CoreSyn.Cast e co)   = fmap (\(mb_co', in_v) -> (castBy (maybe co (\co' -> mkTransCo iss co' co) (castByCo mb_co')) tg, in_v)) $ fro e
    fro (CoreSyn.Lit l)       = Just (Uncast, (emptyRenaming, Literal l))
    fro (CoreSyn.Coercion co) = Just (Uncast, (mkIdentityRenaming (tyCoVarsOfCo co), Coercion co))
    fro e = do (dc, univ_tys, e_args0) <- exprIsConApp_maybe (const CoreSyn.NoUnfolding) e
               case newTyConCo_maybe (dataConTyCon dc) of
                 Just co_axiom -> let [e_arg0] = e_args0 in fro (e_arg0 `CoreSyn.Cast` mkAxInstCo co_axiom univ_tys)
                 Nothing -> do
                   let (ex_tys, e_args1) = takeWhileJust toType_maybe     e_args0
                       (cos,    e_args2) = takeWhileJust toCoercion_maybe e_args1
                       (xs,     e_args3) = takeWhileJust toVar_maybe      e_args2
                   [] <- return e_args3
                   return (Uncast, (renamedValue (Data dc (univ_tys ++ ex_tys) cos xs)))
      where toType_maybe (CoreSyn.Type ty) = Just ty
            toType_maybe _                 = Nothing
            
            toCoercion_maybe (CoreSyn.Coercion co) = Just co
            toCoercion_maybe _                     = Nothing
            
            toVar_maybe (CoreSyn.Var x) = Just x
            toVar_maybe _               = Nothing

type ContextSummary = (Bool, [ArgSummary], CallCtxt)

summariseContext :: PureHeap -> Stack -> ContextSummary
summariseContext h k = trainCarFoldr go (True, [], BoringCtxt) k
  where go kf (lone_variable, arg_infos, cont_info) = case tagee kf of
          TyApply _         -> (lone_variable,           arg_infos, cont_info)
          CoApply _         -> (False, TrivArg         : arg_infos, ValAppCtxt)
          Apply x'          -> (False, summariseArg x' : arg_infos, ValAppCtxt)
          Scrutinise _ _ _  -> (True,                           [], CaseCtxt)
          PrimApply _ _ _ _ -> (True,                           [], ArgCtxt False)
          StrictLet _ _     -> (True,                           [], ArgCtxt False)
          Update _          -> (True,                           [], BoringCtxt)
          CastIt _          -> (lone_variable,           arg_infos, cont_info)

        summariseArg x' = case M.lookup x' h of
          Just hb | Just (_, e) <- heapBindingTerm hb -> if termIsValue e then ValueArg else NonTrivArg
          _ -> TrivArg

-- Can't use callSiteInline because it never inlines stuff with DFunUnfolding!
ghcHeuristics :: Id -> AnnedTerm {- try not to pull on this, it does a lot of work -}
              -> ContextSummary -> Bool
ghcHeuristics x e (lone_variable, arg_infos, cont_info)
  | isStrongLoopBreaker (idOccInfo x)
  = False -- NB: have to check this (not just use idUnfolding) because we might consider "e"
  | otherwise
  = (try (realIdUnfolding x) `mplus` try answer_unf) `orElse` trce (text "No unfolding") False
  where
    try unf = case unf of
      CoreSyn.CoreUnfolding { CoreSyn.uf_is_top = is_top, CoreSyn.uf_is_work_free = is_work_free, CoreSyn.uf_expandable = expandable
                            , CoreSyn.uf_arity = arity, CoreSyn.uf_guidance = guidance }
                               -> trce_fail (ppr (CoreSyn.uf_tmpl unf)) $
                                  Just $ tryUnfolding dflags1 x lone_variable 
                                                      arg_infos cont_info is_top 
                                                      is_work_free expandable
                                                      arity guidance
      -- GHC actually only looks through DFunUnfoldings in exprIsConApp_maybe,
      -- so I'll do this rough heuristic instead:
      CoreSyn.DFunUnfolding {} -> trce_fail (text "Unsaturated dictionary unfolding") $
                                  Just $ length arg_infos >= idArity x
      CoreSyn.NoUnfolding      -> Nothing
      CoreSyn.OtherCon {}      -> Nothing

    dflags0 = defaultDynFlags (error "ghcHeuristics: Settings in DynFlags used!")
    -- Set these two flags so that we get information about failed inlinings:
    dflags1 | tRACE     = dopt_set (dopt_set dflags0 Opt_D_verbose_core2core) Opt_D_dump_inlinings
            | otherwise = dflags0

    trce_fail :: SDoc -> Maybe Bool -> Maybe Bool
    --trce_fail doc (Just False) = trce doc (Just False)
    trce_fail _   mb_x         = mb_x

    trce :: SDoc -> a -> a
    trce | tRACE     = pprTrace ("Considering inlining: " ++ showSDoc (ppr x))
         | otherwise = flip const

    -- NB: I'm not particularly happy that I may have to make up a whole new unfolding at ever
    -- occurrence site, but GHC makes it hard to do otherwise because any binding with a non-stable
    -- Unfolding pinned to it gets the Unfolding zapped by GHC's renamer
    answer_unf = mkUnfolding CoreSyn.InlineRhs False False (termToCoreExpr (annedTermToTerm e))

-- | Non-expansive simplification we can do everywhere safely
--
-- Normalisation only ever releases deeds: it is *never* a net consumer of deeds. So normalisation
-- will never be impeded by a lack of deeds.
normalise :: UnnormalisedState -> State
normalise = snd . step' True . Right

-- | Possibly non-normalising simplification we can only do if we are allowed to by a termination test
--
-- Unlike normalisation, stepping may be a net consumer of deeds and thus be impeded by a lack of them.
step :: State -> Maybe State
step s = guard reduced >> return result
  where (reduced, result) = step' False $ Left s

step' :: Bool -> Either State UnnormalisedState -> (Bool, State) -- The flag indicates whether we managed to reduce any steps *at all*
step' normalising ei_state = {-# SCC "step'" #-}
    -- pprTrace "step'" (either (pPrintFullState quietStatePrettiness) (pPrintFullUnnormalisedState quietStatePrettiness) ei_state) $
    (\res@(_reduced, stepped_state) -> let _deeds = either releaseStateDeed releaseUnnormalisedStateDeed ei_state
                                           _doc = either (pPrintFullState quietStatePrettiness) (pPrintFullUnnormalisedState quietStatePrettiness) ei_state
                                           _fvs = either stateFreeVars unnormalisedStateFreeVars ei_state in
                                       ASSERT2(not dEEDS || noChange _deeds (releaseStateDeed stepped_state),
                                               hang (text "step': deeds lost or gained:") 2 (_doc $$ pPrintFullState quietStatePrettiness stepped_state))
                                       ASSERT2(subVarSet (stateFreeVars stepped_state) _fvs,
                                               text "step': FVs" $$ hang (text "Before:") 2 (pPrint _fvs $$ _doc) $$
                                                                    hang (text "After:")  2 (pPrint (stateFreeVars stepped_state) $$ pPrintFullState quietStatePrettiness stepped_state))
                                       -- traceRender (text "normalising" $$ nest 2 (pPrintFullUnnormalisedState state) $$ text "to" $$ nest 2 (pPrintFullState stepped_state)) $
                                       res) $
    go_entry ei_state
  where
    go_entry :: Either State UnnormalisedState -> (Bool, State)
    go_entry (Left (deeds, heap, k, anned_qa)) = case annee anned_qa of
      Question _ -> go_question (deeds, heap, k, fmap (\(Question x') -> x') anned_qa)
      Answer   _ -> go_answer   (deeds, heap, k, fmap (\(Answer a) -> a)     anned_qa)
    go_entry (Right state) = go state

    go :: UnnormalisedState -> (Bool, State)
    go (deeds, heap@(Heap h ids), k, (rn, e)) = case annee e of
        Var x            -> go_question (deeds, heap, k, fmap (\(rn, Var _)   -> renameId rn x) (renameAnned (rn, e)))
        Value v          -> go_answer   (deeds, heap, k, fmap (\(rn, Value _) -> (rn, v))       (renameAnned (rn, e)))
        TyApp e ty       -> go (deeds, heap,        Tagged tg (TyApply (renameType ids rn ty))                                   `Car` k, (rn, e))
        CoApp e co       -> go (deeds, heap,        Tagged tg (CoApply (renameCoercion ids rn co))                               `Car` k, (rn, e))
        App e x          -> go (deeds, heap,        Tagged tg (Apply (renameId rn x))                                            `Car` k, (rn, e))
        PrimOp pop tys es
          | (e:es) <- es -> go (deeds, heap,        Tagged tg (PrimApply pop (map (renameType ids rn) tys) [] (map ((,) rn) es)) `Car` k, (rn, e))
          | otherwise    -> pprPanic "step': nullary primops unsupported" (ppr pop)
        Case e x ty alts -> go (deeds, Heap h ids', Tagged tg (Scrutinise x' (renameType ids rn ty) (rn', alts))                 `Car` k, (rn, e))
          where (ids', rn', x') = renameNonRecBinder ids rn x
        Cast e co        -> go (deeds', heap,                                                                                         k', (rn, e))
          where (deeds', k') = appendCast deeds ids tg (renameCoercion ids rn co) k
        Let x e1 e2
          | isUnLiftedType (idType x) -> go (deeds,                  Heap h                                       ids', Tagged tg (StrictLet x' (rn', e2)) `Car` k, in_e1)
          | otherwise                 -> go (deeds `releaseDeeds` 1, Heap (M.insert x' (internallyBound in_e1) h) ids',                                          k, (rn', e2))
          where (ids', rn', (x', in_e1)) = renameNonRecBound ids rn (x, e1)
        LetRec xes e     -> go (deeds `releaseDeeds` 1, Heap (h `M.union` M.fromList [(x', internallyBound in_e) | (x', in_e) <- xes']) ids', k, (rn', e))
          where (ids', rn', xes') = renameBounds ids rn xes
      where tg = annedTag e

    appendCast deeds ids tg co' k
      | isReflCo final_co' = (deeds' `releaseDeeds` 1,                              k')
      | otherwise          = (deeds',                  Tagged tg (CastIt final_co') `Car` k')
      where
        -- Maintain the invariant that there are no adjacent casts in the stack
        (deeds', final_co', k') = case k of
          Tagged _old_tg (CastIt old_co') `Car` k' -> (deeds `releaseDeeds` 1, mkTransCo ids co' old_co', k')
          _                                        -> (deeds,                  co',                       k)

    -- TODO: in all the code below, I'm no longer sure whether we preserve deeds or not. In particular, the CastBy cases
    -- are a worry. But since the deeds stuff is probably on the way out I'm not trying to fix it right now.

    go_question (deeds, h, k, anned_x) = maybe (False, (deeds, h, k, fmap Question anned_x)) ((,) True) $ force  deeds h k (annedTag anned_x) (annee anned_x)
    go_answer   (deeds, h, k, anned_a) = maybe (False, (deeds, h, k, fmap Answer anned_a))   ((,) True) $ unwind deeds h k (annedTag anned_a) (annee anned_a)

    -- Deal with a variable at the top of the stack
    -- Might have to claim deeds if inlining a non-value non-internally-bound thing here
    -- FIXME: look inside unfoldings
    force :: Deeds -> Heap -> Stack -> Tag -> Out Var -> Maybe State
    force deeds (Heap h ids) k tg x'
      -- Try to trim the stack if the Id is guaranteed to bottom out after a certain number of arguments
      -- This is really amazingly important because so many case branches bottom out in at least one branch,
      -- and we can save supercompiling big case nests if we trim them out eagerly.
      --
      -- The supercompiled size of Bernouilli decreased from 19193 to 16173 with this change.
      -- FIXME: do this another way? (with freevars)
      | Just (ds, res_d) <- fmap splitStrictSig $ idStrictness_maybe x'
      , isBotRes res_d
      , Just (h_extra, k) <- trimUnreachable (length ds) (idType x') k
      = Just (deeds, Heap (h `M.union` h_extra) ids, k, fmap (\(Var x') -> Question x') (annedTerm tg (Var x'))) -- Kind of a hacky way to get an Anned Question!
      | otherwise = do
        -- NB: it doesn't matter if the thing we look up is superinlinable because we'll actually only need to check then when we come to
        -- look it up as a *value*. In any case, the vast majority of SUPERINLINABLE things will be values, so we'll never get here.
        hb <- M.lookup x' h
        in_e <- heapBindingTerm hb
        case termToCastAnswer ids in_e of
          -- Consider the case of non-internal bindings. We want to get at the values of them, but since we can't create updates we can
          -- only do so if they are already values. This is perhaps weird, but continuing using unwind' is exactly the right thing to do
          Just anned_cast_a -> do
            let deeds0 = releaseDeeds deeds 1 -- We can release the deed for the reference itself
            deeds1 <- if howBound hb == InternallyBound
                       then return deeds0
                       else claimDeeds deeds0 (annedSize anned_cast_a)
            unwind' deeds1 (Heap h ids) k (Just (x', (mkIdentityRenaming (unitVarSet x'), annedTerm tg (Var x'))))
                                          (annedToTagged anned_cast_a)
                                          (if dUPLICATE_VALUES_EVALUATOR then in_e else (mkIdentityRenaming (unitVarSet x'), fmap Var $ annedVar tg x'))
          Nothing -> do
            -- NB: we MUST NOT create update frames for non-concrete bindings!! This has bitten me in the past, and it is seriously confusing. 
            guard (howBound hb == InternallyBound)
            -- Avoid creating consecutive update frames: implements "stack squeezing" to maintain stack invariants
            -- NB: suprisingly this is not broken, even if there are cycles in the heap:
            --     <x |-> y, y |-> x | x |>
            -- --> <         y |-> x | y | update x>
            -- --> <         y |-> x | x | update x>
            --     The reason is that squeezing adds a heap binding that just points into an update frame bound thing,
            --     and once a binder is on the stack it won't be turned into a heap binder until we have got a value,
            --     so inlining the squeezed heap binding will either just get us stuck immediately or get to a value
            --
            -- NB: squeezing will discard any flag marking x' as superinlinable. However, I think this is totally OK:
            -- the update frame already on the stack (which is preserved by squeezing) contains the name by which the *user*
            -- tried to access the function, and it is in keeping with the rest of GHC (where the inlinability you see is
            -- based on the label you wrote in your own code) that that is the relevant flag.
            return $ normalise $ case (fst (peelUpdateStack k)) of
              Nothing                        -> (deeds, Heap (M.delete x' h)                            ids, Tagged tg (Update x') `Car` k, in_e)
              Just (cast_by, Tagged tg_y y') -> (deeds, Heap (M.insert x' (internallyBound in_e_ref) h) ids,                             k, in_e)
                where in_e_ref = mkVarCastBy tg_y y' cast_by

    -- TODO: this function totally ignores deeds
    trimUnreachable :: Int   -- Number of value arguments needed before evaluation bottoms out
                    -> Type  -- Type of the possibly-bottoming thing in the hole
                    -> Stack -- Stack consuming the hole
                    -> Maybe (PureHeap, -- Heap bindings arising from any update frames we trimmed off
                              Stack)    -- Trimmed stack (strictly "less" than the input one -- not necessarily shorter since we will replace e.g. a trailing Scrutinise with a Cast)
    trimUnreachable = go
      where
        -- Ran out of stack: even if n == 0 we don't want to
        -- trim the stack in these cases because we musn't return
        -- Just if the tail of the stack is already trivial: doing
        -- so would risk non-termination
        go _ _       (Loco _)                           = Nothing
        go _ _       (Tagged _ (CastIt _) `Car` Loco _) = Nothing
        -- Got some non-trivial stack that is unreachable due to bottomness: kill it (remembering to bind any updated stuff)
        go 0 hole_ty k@(Tagged cast_tg _ `Car` _) = Just $
          trainFoldl' (\(!hole_ty,    !h) (Tagged tg kf) -> (stackFrameType' kf hole_ty, case kf of Update x' -> M.insert x' (internallyBound (renamedTerm (annedTerm tg (Var x')))) h; _ -> h))
                      (\(!overall_ty, !h) gen -> (h, (if hole_ty `eqType` overall_ty then id else (Tagged cast_tg (CastIt (mkUnsafeCo hole_ty overall_ty)) `Car`)) $ Loco gen)) (hole_ty, M.empty) k
        -- Haven't yet reached a bottom, but we might get enough arguments to reach
        -- one in the future, so keep going
        go n hole_ty (kf `Car` k) = mb_n' >>= \n' -> liftM (second (kf `Car`)) $ go n' (stackFrameType kf hole_ty) k
          where mb_n' = case tagee kf of
                          TyApply _         -> Just n
                          CoApply _         -> Just (n - 1)
                          Apply _           -> Just (n - 1)
                          Scrutinise _ _ _  -> Nothing
                          PrimApply _ _ _ _ -> Nothing
                          StrictLet _ _     -> Nothing
                          Update _          -> Just n
                          CastIt _          -> Just n

    -- Deal with a value at the top of the stack
    unwind :: Deeds -> Heap -> Stack -> Tag -> Answer -> Maybe State
    unwind deeds heap k tg_a a = unwind' deeds heap k Nothing (Tagged tg_a (Uncast, a)) (taggedAnswerToInAnnedTerm (Tagged tg_a a))

    unwind' :: Deeds -> Heap -> Stack
            -> Maybe (Var,             -- Variable name of thing in focus, for normalisation check/SUPERINLINABLE purposes
                      In AnnedTerm)    -- Version of the focus that should go into *focus* (useful if we are reducing <x = v | x | update y >, we want x in the focus in next step, not y!)
            -> Tagged (Coerced Answer) -- "Meaning" of the focus, in value terms
            -> In AnnedTerm            -- Version of the focus that should go into heap (may just be a variable reference to a value)
            -> Maybe State
    unwind' deeds heap@(Heap h ids) k mb_x' (Tagged tg_a cast_a@(mb_co, (rn, v))) in_e_heap = case k of
     Loco _ -> Nothing
     Car kf k -> case tagee kf of
        TyApply ty'                    ->                  tyApply    (deeds `releaseDeeds` 1)          ty'
        CoApply co'                    ->                  coApply    (deeds `releaseDeeds` 1)          co'
        Apply x2'                      ->                  apply      deeds                    (tag kf) x2'
        Scrutinise x' ty' in_alts      -> fmap normalise $ scrutinise (deeds `releaseDeeds` 1)          x' ty' in_alts
        PrimApply pop tys' in_vs in_es -> fmap normalise $ primop     deeds                    (tag kf) pop tys' in_vs in_es
        StrictLet x' in_e2             -> fmap normalise $ strictLet  (deeds `releaseDeeds` 1)          x' in_e2
        -- NB: since there are never two adjacent casts on the stack, our reference expressions will always have at most 1 coercion
        CastIt co'                     -> unwind' deeds heap k (fmap (second cast_e) mb_x') (castAnswer ids (tag kf) co' (Tagged tg_a cast_a)) (cast_e in_e_heap) -- TODO: can potentially release deeds from cast_a double-cast here
          where cast_e in_e = (mkInScopeIdentityRenaming ids, annedTerm (tag kf) (renameIn (renameAnnedTerm ids) in_e `Cast` co'))
        Update x'                      -> do
            -- If duplicating values, we ensure normalisation by not executing updates
            guard (not normalising || not dUPLICATE_VALUES_EVALUATOR)
            return $ normalise (deeds', Heap (M.insert x' (internallyBound in_e_heap) h) ids, k, in_e')
          where Just deeds' = claimDeeds (deeds `releaseDeeds` castAnswerSize cast_a) (annedTermSize (snd in_e_heap))
                in_e' = case mb_x' of
                          Nothing              -> (mkIdentityRenaming (unitVarSet x'), fmap Var $ annedVar (tag kf) x')
                          Just (_, in_e_focus) -> in_e_focus
      where
        -- Should check this is not Nothing before using the "meaning" a
        checkShouldExposeUnfolding = case mb_x' of
                     -- FIXME: if we eliminate "dead" update frames, we should return True here or else WAY TOO MUCH will be treated as superinlinable
                     Nothing -> Just True
                     -- We have to check this here (not just when preparing unfoldings) because
                     -- we would like to also exclude local recursive loops, not just top-level ones
                     Just (x', _) -> case shouldExposeUnfolding x' of
                       Right super  -> Just super
                       Left why_not -> pprTrace "Unavailable:" (ppr x' <+> parens (text why_not)) $
                                       fail why_not

        -- NB: assumes that ei_state has the same size as what we would return from "step" if the dereferencing fails.
        -- At the time of writing, this invariant holds (since turning terms into equivalent stack frames doesn't change size)
        checkLambdaish :: UnnormalisedState -> Maybe State
        checkLambdaish s'_unnormalised
          -- If not duplicating values, we ensure normalisation by not executing applications to non-explicit-lambdas
          | normalising, isJust mb_x', not dUPLICATE_VALUES_EVALUATOR = Nothing
          | otherwise = do
            super <- checkShouldExposeUnfolding
            let s' = normalise s'_unnormalised
            -- NB: you might think it would be OK to inline even when *normalising*, as long as the inlining
            -- makes the term strictly smaller. This is very tempting, but we would have to check that the size
            -- measure decreased by the inlining is the same as the size measure needed to prove normalisation of
            -- the reduction system, and I'm too lazy to do that right now.
            --
            -- NB: it is hard to get a size-decrease check like this to work well. Consider:
            --  bindIO getArgs (\x -> e)
            -- This has 2 applications and one lambda (I've omitted all casts, since they cost 0). If we inline we get:
            --  \s -> case getArgs s of (# x, s #) -> e s
            -- This has 2 applications, one lambda, and ONE CASE EXPRESSION. So things must have got worse! But bindIO with
            -- one known argument is almost a canonical example of the kind of thing we *would* like to inline, and the example
            -- above assumes perfect reduction: note in particular that we've beta-reduced the (\x -> e) away entirely (and
            -- done so within a case branch), so to even get this far we need "deep normalisation" *and* inlining used-once lams
            -- as part of this check.
            --
            -- Perhaps if we had deep normalisation + GC we could get these results by penalising heap allocation heavily?
            -- If so we must remember to do it for heap bindings *and* letrecs.
            let k_summary = summariseContext h (Car kf k)
            guard $ case () of
              _ -- If the lambda is marked SUPERINLINABLE, always inline it
                | super
                -> True
                -- If inlining gets us to a value, accept it. This is a bit ad-hoc for two reasons:
                --  1. We might reach a value now, and then later apply two more arguments to effectively unconditionally inline a full application
                --  2. We might need to do another round of inlining to actually expose a value e.g. g in (f = \x y -> e; g = \x -> f x; h = g x)
                -- However, this is important so that speculation is able to turn partial applications into explicit values
                | (_, _, k, qa) <- s'
                , Answer _ <- annee qa
                , Just _ <- isCastStack_maybe k -- Might be a trailing cast
                -> True
                -- If the result is actually smaller, accept it (this catches manifest values)
                -- NB: garbage collect before comparison in case we inlined an internallyBound thing
                -- into its only use site, which is a very important case to catch!
                -- TODO: should gc the unnormalised state as well.. but risks non-termination?
                | stateSize (gc s') <= either (stateSize . gc) unnormalisedStateSize ei_state
                -> True
                -- It might still be OK to get larger if GHC's inlining heuristics say we should
                | Just (x', _) <- mb_x'
                , ghcHeuristics x' (annedTerm tg_a (coercedAnswerToAnnedTerm' ids cast_a)) k_summary -- NB: the tag is irrelevant
                -> True
                -- Otherwise, we don't want to beta-reduce
                | otherwise
                -> pprTrace "Unwanted:" (pPrint (coercedAnswerToAnnedTerm' ids cast_a)) False
            (case mb_x' of Just (x', _) -> pprTrace "Inlining" (ppr x'); Nothing -> id) $
              return s'

        tyApply :: Deeds -> Out Type -> Maybe State
        tyApply deeds0 ty' = do
          TyLambda x e_body <- return v
          checkLambdaish $
            let deeds1 = deeds0 `releaseDeeds` 1 -- Release deed associated with the lambda (but not its body)
                (deeds2, k') = case mb_co of Uncast            -> (deeds1, k)
                                             CastBy co' _tg_co -> appendCast deeds1 ids tg_a (co' `mk_inst` ty') k
            in (deeds2, heap, k', (insertTypeSubst rn x ty', e_body))
          where mk_inst = mkInstCo ids

        coApply :: Deeds -> Out Coercion -> Maybe State
        coApply deeds0 apply_co' = do
          Lambda x e_body <- return v
          checkLambdaish $
            let deeds1 = deeds0 `releaseDeeds` 1 -- Release deed associated with the lambda (but not its body)
            in case mb_co of
                 Uncast            -> (deeds1, heap, k,  (insertCoercionSubst rn x apply_co',      e_body))
                 CastBy co' _tg_co -> (deeds2, heap, k', (insertCoercionSubst rn x cast_apply_co', e_body))
                   where -- Implements the special case of beta-reduction of cast lambda where the argument is an explicit coercion value.
                         -- You can derive this rule from the rules in "Practical aspects of evidence-based compilation" by combining:
                         --  1. TPush, to move the co' from the lambda to the argument and result (arg_co' and res_co')
                         --  2. The rules in Figure 5, to replace a cast of a coercion value with a simple coercion value
                         --  3. The fact that nth commutes with sym to clean up the result (can be proven from Figure 4)
                         (arg_co', res_co') = (mkNthCo 0 co', mkNthCo 1 co')
                         (arg_from_co', arg_to_co') = (mkNthCo 0 arg_co', mkNthCo 1 arg_co')
                         cast_apply_co' = arg_from_co' `mk_trans` apply_co' `mk_trans` mk_sym arg_to_co'
                         mk_trans = mkTransCo ids
                         mk_sym = mkSymCo ids
                         -- Maintain the no-adjacent-casts invariant
                         (deeds2, k') = appendCast deeds1 ids tg_a res_co' k

        apply :: Deeds -> Tag -> Out Var -> Maybe State
        apply deeds0 tg_kf x' = do
          Lambda x e_body <- return v
          checkLambdaish $ case mb_co of
              Uncast -> (deeds1, Heap h ids, k, (insertIdRenaming rn x x', e_body))
                where deeds1 = deeds0 `releaseDeeds` 2 -- Release deed associated with the lambda (but not its body), AND that from the stack frame
              CastBy co' _tg_co -> (deeds1, Heap (M.insert y' (internallyBound (renamedTerm e_arg)) h) ids', k', (rn', e_body))
                where (ids', rn', y') = renameNonRecBinder ids rn (x `setIdType` arg_co_from_ty')
                      Pair arg_co_from_ty' _arg_co_to_ty' = coercionKind arg_co'
                      (arg_co', res_co') = (mkNthCo 0 co', mkNthCo 1 co')
                      e_arg = annedTerm tg_a (annedTerm tg_kf (Var x') `Cast` mkSymCo ids arg_co')
                      (deeds1, k') = appendCast deeds0 ids tg_a res_co' k -- Might release a deed if the final body coercion is refl

        -- TODO: use checkShouldExposeUnfolding
        scrutinise :: Deeds -> Out Var -> Out Type -> In [AnnedAlt] -> Maybe UnnormalisedState
        scrutinise deeds_init wild' _ty' (rn_alts, alts)
           -- Literals are easy -- we can make the simplifying assumption that the types of literals are
           -- always simple TyCons without any universally quantified type variables.
          | Literal l <- v
          , case mb_co_kind of Nothing -> True; Just (_, _, Pair from_ty' to_ty') -> from_ty' `eqType` to_ty' -- NB: should never see refl here!
          , (deeds2, alt_e):_ <- [(deeds1 `releaseDeeds` annedAltsSize rest, (rn_alts, alt_e)) | ((LiteralAlt alt_l, alt_e), rest) <- bagContexts alts, alt_l == l]
          = Just (deeds2, Heap h1 ids, k, alt_e)
          
           -- Data is a big stinking mess! I hate you, KPush rule.
          | Data dc tys cos xs <- v
           -- a) Ensure that the coercion on the data (if any) lets us do the reduction, and determine
           --    the appropriate coercions to use (if any) on each value argument to the DataCon
          , Just mb_dc_cos <- case mb_co_kind of
                                    Nothing -> return Nothing
                                    Just (co', tg_co, Pair from_ty' to_ty') -> do
                                      (from_tc, _from_tc_arg_tys') <- splitTyConApp_maybe from_ty'
                                      (to_tc,   _to_tc_arg_tys')   <- splitTyConApp_maybe to_ty'
                                      guard $ from_tc == to_tc
                                      return $ Just $
                                        let -- Substantially copied from CoreUnfold.exprIsConApp_maybe:
                                            tc_arity       = tyConArity from_tc
                                            dc_univ_tyvars = dataConUnivTyVars dc
                                            dc_ex_tyvars   = dataConExTyVars dc
                                            arg_tys        = dataConRepArgTys dc
                                        
                                            -- Make the "theta" from Fig 3 of the paper
                                            (_univ_tys, ex_tys) = splitAt tc_arity tys
                                            gammas = decomposeCo tc_arity co'
                                            theta_subst = ASSERT2(length dc_univ_tyvars == tc_arity,      ppr dc $$ ppr dc_univ_tyvars $$ ppr tc_arity)
                                                          ASSERT2(length dc_ex_tyvars   == length ex_tys, ppr dc $$ ppr dc_ex_tyvars $$ ppr (length ex_tys))
                                                          liftCoSubstWith (dc_univ_tyvars ++ dc_ex_tyvars)
                                                                          (gammas         ++ map mkReflCo ex_tys)
                                        in map (\arg_ty -> (theta_subst arg_ty, tg_co)) arg_tys -- Use tag from the original coercion everywhere
           -- b) Identify the first appropriate branch of the case and reduce -- apply the discovered coercions if necessary
          , (deeds3, h', ids', alt_e):_ <- [ res
                                           | ((DataAlt alt_dc alt_as alt_qs alt_xs, alt_e), rest) <- bagContexts alts
                                           , alt_dc == dc
                                           , let tys' = map (renameType ids rn) tys
                                                 cos' = map (renameCoercion ids rn) cos
                                                 xs' = map (renameId rn) xs
                                                 rn_alts' = insertTypeSubsts rn_alts (alt_as `zip` tys')
                                                 deeds2 = deeds1 `releaseDeeds` annedAltsSize rest
                                           , Just res <- [do (deeds3, h', ids', rn_alts') <- case mb_dc_cos of
                                                               Nothing     -> return (deeds2, h1, ids, insertIdRenamings (insertCoercionSubsts rn_alts' (alt_qs `zip` cos')) (alt_xs `zip` xs'))
                                                               Just dc_cos -> foldM (\(deeds, h, ids, rn_alts) (uncast_e_arg', alt_y, (dc_co, tg_co)) ->
                                                                                        let Pair _dc_co_from_ty' dc_co_to_ty' = coercionKind dc_co -- TODO: use to_tc_arg_tys' from above?
                                                                                            (ids', rn_alts', y') = renameNonRecBinder ids rn_alts (alt_y `setIdType` dc_co_to_ty')
                                                                                            e_arg = annedTerm tg_co $ annedTerm tg_a uncast_e_arg' `Cast` dc_co
                                                                                        in fmap (\deeds' -> (deeds', M.insert y' (internallyBound (renamedTerm e_arg)) h, ids', rn_alts')) $ claimDeeds deeds (annedSize e_arg))
                                                                                    (deeds2, h1, ids, rn_alts') (zip3 (map (Value . Coercion) cos' ++ map Var xs') (alt_qs ++ alt_xs) dc_cos)
                                                             return (deeds3, h', ids', (rn_alts', alt_e))]
                                           ]
          = Just (deeds3, Heap h' ids', k, alt_e)
          
           -- Thank god, default alternatives are trivial:
          | (deeds2, alt_e):_ <- [(deeds1 `releaseDeeds` annedAltsSize rest, (rn_alts, alt_e)) | ((DefaultAlt, alt_e), rest) <- bagContexts alts]
          = Just (deeds2, Heap h1 ids, k, alt_e)
          
           -- This can legitimately occur, e.g. when supercompiling (if x then (case x of False -> 1) else 2)
          | otherwise
          = Nothing
          where mb_co_kind = case mb_co of
                               Uncast          -> Nothing
                               CastBy co tg_co -> Just (co, tg_co, coercionKind co)
                deeds0 = deeds_init `releaseDeeds` castAnswerSize cast_a
                (deeds1, h1) | isDeadBinder wild' = (deeds0,                         h)
                             | otherwise          = (deeds0', M.insert wild' wild_hb h)
                               where wild_hb = internallyBound in_e_heap
                                     Just deeds0' = claimDeeds deeds0 (annedTermSize (snd in_e_heap))
                                     -- NB: we add the *non-dereferenced* value to the heap for a case wildcard, because anything else may duplicate allocation

        -- NB: this actually duplicates the answer "a" into the answers field of the PrimApply, even if that "a" is update-bound
        -- This isn't perhaps in the spirit of the rest of the evaluator, but it probably doesn't matter at all in practice.
        -- TODO: use checkShouldExposeUnfolding?
        primop :: Deeds -> Tag -> PrimOp -> [Out Type] -> [Anned (Coerced Answer)] -> [In AnnedTerm] -> Maybe UnnormalisedState
        primop deeds tg_kf pop tys' anned_as [] = do
            guard eVALUATE_PRIMOPS -- NB: this is not faithful to paper 1 because we still turn primop expressions into
                                   -- stack frames.. this is bad because it will impede good specilations (without smart generalisation)
            let tg_kf' = tg_kf { tagOccurrences = if oCCURRENCE_GENERALISATION then tagOccurrences tg_kf + sum (map tagOccurrences (tg_a : map annedTag anned_as)) else 1 }
            anned_cast_a' <- evaluatePrim ids tg_kf' pop tys' (map annee anned_as ++ [cast_a])
            deeds <- claimDeeds (deeds `releaseDeeds` (sum (map annedSize anned_as) + castAnswerSize cast_a + 1)) (annedSize anned_cast_a') -- I don't think this can ever fail
            return (deeds, heap, k, taggedCastAnswerToInAnnedTerm ids (annedToTagged anned_cast_a'))
        primop deeds tg_kf pop tys' anned_as in_es = case in_es of
            (in_e:in_es) -> Just (deeds, heap, Tagged tg_kf (PrimApply pop tys' (anned_as ++ [annedCoercedAnswer tg_a cast_a]) in_es) `Car` k, in_e)
            []           -> Nothing

        strictLet :: Deeds -> Out Var -> In AnnedTerm -> Maybe UnnormalisedState
        strictLet deeds x' in_e2 = Just (deeds', Heap (M.insert x' (internallyBound in_e_heap) h) ids, k, in_e2)
          where Just deeds' = claimDeeds (deeds `releaseDeeds` castAnswerSize cast_a) (annedTermSize (snd in_e_heap))


-- We don't want to expose an unfolding if it would not be inlineable in the initial phase.
-- This gives normal RULES more of a chance to fire.
--
-- NB: this only controls whether a particular definition is available for inlining at all.
-- For the inlining to actualy happen we also make a check at the call-site that considers
-- whether it is benefical to inline the definition into the particular context we find.
--
-- NB: a place where these heuristics really hurt is:
--   {-# INLINE [0] foldr #-}
--   foldr k z = go
--             where
--               go []     = z
--               go (y:ys) = y `k` go ys
--
-- Although foldr gets inlined by the supercompiler, "go" is a local recursive loop which
-- doesn't get inlined at all. See also:
--  {-# SUPERINLINABLE sum #-}
--  sum     l       = sum' l 0
--    where
--      sum' []     a = a
--      sum' (x:xs) a = sum' xs (a+x)
--
-- A possible solution is to mark all those Ids syntactically contained within a SUPERINLINABLE
-- unfolding as SUPERINLINABLE if they are not explicitly INLINE/NOINLINE. We can do this when
-- we construct the unfoldings in the first place.
shouldExposeUnfolding :: Id -> Either String Superinlinable
shouldExposeUnfolding x = case inl_inline inl_prag of
    -- FIXME: God help my soul
    _ | Just mod <- nameModule_maybe (idName x)
      , moduleName mod `elem` map mkModuleName [
          "Data.Complex", "GHC.List",
          "QSort", -- awards
          "Checker", "Lisplikefns", "Rewritefns", "Rulebasetext", -- boyer2
          "Auxil", "Interval", "Key", "Prog", -- cichelli
          "MonadState", "MonadTrans", -- cryptarithm2
          "StateMonad", -- cse
          "Knowledge", "Result", "Search", "Table", "Match", -- expert
          "Fourier", "Complex_Vectors", -- fft2
          "RA", "RC", "RG", "RU", "Types", -- nucleic2
          "ChessSetArray", "ChessSetList", "KnightHeuristic", "Queue", "Sort", -- knights
          "Mandel", -- mandel
          "Move", "Problem", "Solution", -- mate
          "Board", "Game", "Prog", "Tree", "Wins", -- minimax
          "CharSeq", "Pretty", -- pretty
          "IntLib", "MyRandom", "Prime", -- primetest
          "Digraph" -- scc
        ]
      -> Right True
    -- These get wrappers generated for them: be very eager to inline the wrappers
      | isPrimOpId x || isDataConWorkId x
      -> Right True
    -- NB: we don't check the activation on INLINE things because so many activations
    -- are used to ensure that e.g. RULE-based fusion works properly, and NOINLINE will
    -- generally impede supercompiler-directed fusion.
    --
    -- Our philosophy: if it is *ever* inlinable (in any phase), expose it
    Inline                                 -> Right True -- Don't check for size increase at all if marked INLINE
    Inlinable super
      | only_if_superinlinable, not super  -> Left "INLINEABLE but not SUPERINLINABLE"
      | otherwise                          -> Right super
    NoInline
      | isNeverActive (inl_act inl_prag)   -> Left "unconditional NONLINE"
      | only_if_superinlinable             -> Left "conditional NOINLINE, not SUPERINLINABLE"
    EmptyInlineSpec 
      | only_if_superinlinable             -> Left "not SUPERINLINABLE"
    _                                      -> Right False
  where inl_prag = idInlinePragma x
        -- EXPERIMENT: only respect the SUPERINLINABLE distinction on *loop breakers*
        -- The motivation is that we don't really want to go around annotating (GHC.Base.>>=),
        -- bindIO, etc etc as SUPERINLINABLE.
        only_if_superinlinable = case sUPERINLINABILITY of
          ForRecursion  -> isStrongLoopBreaker (idOccInfo x)
          ForEverything -> True
          ForNothing    -> False


-- We used to garbage-collect in the evaluator, when we executed the rule for update frames. This had two benefits:
--  1) We don't have to actually update the heap or even claim a new deed
--  2) We make the supercompiler less likely to terminate, because GCing so tends to reduce TagBag sizes
--
-- However, this caused problems with speculation: to prevent incorrectly garbage collecting bindings from the invisible "enclosing"
-- heap when we speculated one of the bindings from the heap, we had to pass around an extra "live set" of parts of the heap that might
-- be referred to later on. Furthermore:
--  * Finding FVs when executing every update step was a bit expensive (though they were memoized on each of the State components)
--  * This didn't GC cycles (i.e. don't consider stuff from the Heap that was only referred to by the thing being removed as "GC roots")
--  * It didn't seem to make any difference to the benchmark numbers anyway
--
-- You might think a good alternative approach is to:
-- 1. Drop dead update frames in transitiveInline (which is anyway responsible for ensuring there is no dead stuff in the stack)
-- 2. "Squeeze" just before the matcher: this shorts out indirections-to-indirections and does update-frame stack squeezing.
--    You might also think that it would be cool to just do this in normalisation, but then when normalising during specualation the enclosing
--    context wouldn't get final_rned :-(
--
-- HOWEVER. That doesn't work properly because normalisation itself can introduce dead bindings - i.e. in order to be guaranteed to
-- catch all the junk we have to GC normalised bindings, not the pre-normalised ones that transitiveInline sees. So instead I did
-- both points 1 and 2 right just before we go to the matcher.
--
-- HOWEVER. Simon suggested something that made me realise that actually we could do squeezing of consecutive update frames and
-- indirection chains in the evaluator (and thus the normaliser) itself, which is even cooler. Thus all that is left to do in the
-- GC is to make a "global" analysis that drops stuff that is definitely dead. We *still* want to run this just before the matcher because
-- although dead heap bindings don't bother it, it would be confused by dead update frames.
--
-- TODO: have the garbage collector collapse (let x = True in x) to (True) -- but note that this requires onceness analysis
gc :: State -> State
gc _state@(deeds0, Heap h ids, k, in_e)
  = {-# SCC "gc" #-}
    ASSERT2(stateUncoveredVars gced_state `subVarSet` stateUncoveredVars _state, ppr (stateUncoveredVars gced_state, PrettyDoc (pPrintFullState quietStatePrettiness _state), PrettyDoc (pPrintFullState quietStatePrettiness gced_state)))
    gced_state -- We do not insist that *no* variables are uncovered because when used from the speculator this may not be true
  where
    gced_state = (deeds2, Heap h' ids, k', in_e)
    
    -- We have to use stateAllFreeVars here rather than stateFreeVars because in order to safely prune the live stack we need
    -- variables bound by k to be part of the live set if they occur within in_e or the rest of the k
    live0 = stateAllFreeVars (deeds0, Heap M.empty ids, k, in_e)
    (deeds1, h', live1) = inlineLiveHeap deeds0 h live0
    -- Collecting dead update frames doesn't make any new heap bindings dead since they don't refer to anything
    (deeds2, k') | False     = pruneLiveStack deeds1 k live1
                 | otherwise = (deeds1, k) -- FIXME: turned this off for now because it means that the resulting term might not be normalised (!!!)
                                           -- NB: if you change this check out checkShouldExposeUnfolding as well
    
    inlineLiveHeap :: Deeds -> PureHeap -> FreeVars -> (Deeds, PureHeap, FreeVars)
    inlineLiveHeap deeds h live = (foldr (flip releaseHeapBindingDeeds . snd) deeds h_dead_kvs, h_live, live')
      where
        (h_dead_kvs, h_live, live') = heap_worker (M.toAscList h) M.empty live
        
        -- This is just like Split.transitiveInline, but simpler since it never has to worry about running out of deeds:
        heap_worker :: [(Var, HeapBinding)]   -- Possibly-dead heap as sorted list. NB: not a PureHeap map because creating..
                    -> PureHeap -> FreeVars   -- ..the map on every iteration showed up as 6% of SC allocations!
                    -> ([(Var, HeapBinding)], -- Dead heap
                        PureHeap, FreeVars)
        heap_worker h_pending h_output live
          = if live == live'
            then (h_pending', h_output', live')
            else heap_worker h_pending' h_output' live'
          where 
            (h_pending', h_output', live') = foldr consider_inlining ([], h_output, live) h_pending
        
            -- NB: It's important that type variables become live after inlining a binding, or we won't
            -- necessarily lambda-abstract over all the free type variables of a h-function
            consider_inlining (x', hb) (h_pending_kvs, h_output, live)
              | x' `elemVarSet` live = (h_pending_kvs,            M.insert x' hb h_output, live `unionVarSet` heapBindingFreeVars hb `unionVarSet` varBndrFreeVars x')
              | otherwise            = ((x', hb) : h_pending_kvs, h_output,                live)
    
    -- NB: doing this is cool yet also dangerous at the same time. What if we have:
    --  {-# NOINLINE foo #-}
    --  foo = \x -> e
    --
    --  root = case foo 100 of \Delta
    --
    -- After normalisation + GCing (including dropping dead update frames) we will basically get:
    --  case (\x -> e) 100 of \Delta
    --
    -- So this is really bad because we have lost the NOINLINE information!
    -- Of course, this is also sometimes cool because it turns non-normalising beta-reductions into manifestly normalising ones.
    --
    -- My compromise is to allow dumping only those binders with "shortable" IdInfo, where shortability
    -- is a notion stolen from GHCs simplifier.
    --
    -- TODO: perhaps this same check should be applied in the Update frame compressor, though that would destroy some stack invariants
    pruneLiveStack :: Deeds -> Stack -> FreeVars -> (Deeds, Stack)
    pruneLiveStack init_deeds k live = trainFoldr (\kf (deeds, k_live) -> if (case tagee kf of Update x' | isShortableIdInfo (idInfo x') -> x' `elemVarSet` live; _ -> True)
                                                                          then (deeds, kf `Car` k_live)
                                                                          else (deeds `releaseStackFrameDeeds` kf, k_live))
                                                  (\gen deeds -> (deeds, Loco gen)) init_deeds k