//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "CSPreInliner.h" #include "llvm/ADT/SCCIterator.h" #include #include #define DEBUG_TYPE "cs-preinliner" using namespace llvm; using namespace sampleprof; static cl::opt EnableCSPreInliner( "csspgo-preinliner", cl::Hidden, cl::init(false), cl::desc("Run a global pre-inliner to merge context profile based on " "estimated global top-down inline decisions")); // The switches specify inline thresholds used in SampleProfileLoader inlining. // TODO: the actual threshold to be tuned here because the size here is based // on machine code not LLVM IR. extern cl::opt SampleHotCallSiteThreshold; extern cl::opt SampleColdCallSiteThreshold; extern cl::opt ProfileInlineGrowthLimit; extern cl::opt ProfileInlineLimitMin; extern cl::opt ProfileInlineLimitMax; static cl::opt SamplePreInlineReplay( "csspgo-replay-preinline", cl::Hidden, cl::init(false), cl::desc( "Replay previous inlining and adjust context profile accordingly")); CSPreInliner::CSPreInliner(StringMap &Profiles, uint64_t HotThreshold, uint64_t ColdThreshold) : ContextTracker(Profiles), ProfileMap(Profiles), HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {} std::vector CSPreInliner::buildTopDownOrder() { std::vector Order; ProfiledCallGraph ProfiledCG(ContextTracker); // Now that we have a profiled call graph, construct top-down order // by building up SCC and reversing SCC order. scc_iterator I = scc_begin(&ProfiledCG); while (!I.isAtEnd()) { for (ProfiledCallGraphNode *Node : *I) { if (Node != ProfiledCG.getEntryNode()) Order.push_back(Node->Name); } ++I; } std::reverse(Order.begin(), Order.end()); return Order; } bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, const FunctionSamples *CallerSamples) { assert(CallerSamples && "Expect non-null caller samples"); // Ideally we want to consider everything a function calls, but as far as // context profile is concerned, only those frames that are children of // current one in the trie is relavent. So we walk the trie instead of call // targets from function profile. ContextTrieNode *CallerNode = ContextTracker.getContextFor(CallerSamples->getContext()); bool HasNewCandidate = false; for (auto &Child : CallerNode->getAllChildContext()) { ContextTrieNode *CalleeNode = &Child.second; FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); if (!CalleeSamples) continue; // Call site count is more reliable, so we look up the corresponding call // target profile in caller's context profile to retrieve call site count. uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples(); uint64_t CallsiteCount = 0; LineLocation Callsite = CalleeNode->getCallSiteLoc(); if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); auto It = TargetCounts.find(CalleeSamples->getName()); if (It != TargetCounts.end()) CallsiteCount = It->second; } // TODO: call site and callee entry count should be mostly consistent, add // check for that. HasNewCandidate = true; CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount)); } return HasNewCandidate; } bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { // If replay inline is requested, simply follow the inline decision of the // profiled binary. if (SamplePreInlineReplay) return Candidate.CalleeSamples->getContext().hasAttribute( ContextWasInlined); // Adjust threshold based on call site hotness, only do this for callsite // prioritized inliner because otherwise cost-benefit check is done earlier. unsigned int SampleThreshold = SampleColdCallSiteThreshold; if (Candidate.CallsiteCount > HotCountThreshold) SampleThreshold = SampleHotCallSiteThreshold; // TODO: for small cold functions, we may inlined them and we need to keep // context profile accordingly. if (Candidate.CallsiteCount < ColdCountThreshold) SampleThreshold = SampleColdCallSiteThreshold; return (Candidate.SizeCost < SampleThreshold); } void CSPreInliner::processFunction(const StringRef Name) { LLVM_DEBUG(dbgs() << "Process " << Name << " for context-sensitive pre-inlining\n"); FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); if (!FSamples) return; // Use the number of lines/probes as proxy for function size for now. // TODO: retrieve accurate size from dwarf or binary instead. unsigned FuncSize = FSamples->getBodySamples().size(); unsigned FuncFinalSize = FuncSize; unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); ProfiledCandidateQueue CQueue; getInlineCandidates(CQueue, FSamples); while (!CQueue.empty() && FuncFinalSize < SizeLimit) { ProfiledInlineCandidate Candidate = CQueue.top(); CQueue.pop(); bool ShouldInline = false; if ((ShouldInline = shouldInline(Candidate))) { // We mark context as inlined as the corresponding context profile // won't be merged into that function's base profile. ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); Candidate.CalleeSamples->getContext().setAttribute( ContextShouldBeInlined); FuncFinalSize += Candidate.SizeCost; getInlineCandidates(CQueue, Candidate.CalleeSamples); } LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined") << " context profile for: " << Candidate.CalleeSamples->getNameWithContext() << " (callee size: " << Candidate.SizeCost << ", call count:" << Candidate.CallsiteCount << ")\n"); } LLVM_DEBUG({ if (!CQueue.empty()) dbgs() << " Inline candidates ignored due to size limit (inliner " "original size: " << FuncSize << ", inliner final size: " << FuncFinalSize << ", size limit: " << SizeLimit << ")\n"; while (!CQueue.empty()) { ProfiledInlineCandidate Candidate = CQueue.top(); CQueue.pop(); bool WasInlined = Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); dbgs() << " " << Candidate.CalleeSamples->getNameWithContext() << " (candidate size:" << Candidate.SizeCost << ", call count: " << Candidate.CallsiteCount << ", previously " << (WasInlined ? "inlined)\n" : "not inlined)\n"); } }); } void CSPreInliner::run() { if (!EnableCSPreInliner) return; #ifndef NDEBUG auto printProfileNames = [](StringMap &Profiles, bool IsInput) { dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" << Profiles.size() << " total):\n"; for (auto &It : Profiles) { const FunctionSamples &Samples = It.second; dbgs() << " [" << Samples.getNameWithContext() << "] " << Samples.getTotalSamples() << ":" << Samples.getHeadSamples() << "\n"; } }; #endif LLVM_DEBUG(printProfileNames(ProfileMap, true)); // Execute global pre-inliner to estimate a global top-down inline // decision and merge profiles accordingly. This helps with profile // merge for ThinLTO otherwise we won't be able to merge profiles back // to base profile across module/thin-backend boundaries. // It also helps better compress context profile to control profile // size, as we now only need context profile for functions going to // be inlined. for (StringRef FuncName : buildTopDownOrder()) { processFunction(FuncName); } // Not inlined context profiles are merged into its base, so we can // trim out such profiles from the output. std::vector ProfilesToBeRemoved; for (auto &It : ProfileMap) { SampleContext Context = It.second.getContext(); if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) { assert(Context.hasState(MergedContext) && "Not inlined context profile should be merged already"); ProfilesToBeRemoved.push_back(It.first()); } } for (StringRef ContextName : ProfilesToBeRemoved) { ProfileMap.erase(ContextName); } // Make sure ProfileMap's key is consistent with FunctionSamples' name. SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles(); LLVM_DEBUG(printProfileNames(ProfileMap, false)); }