//===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the FrameOptimizerPass class. // //===----------------------------------------------------------------------===// #include "bolt/Passes/FrameOptimizer.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Passes/BinaryFunctionCallGraph.h" #include "bolt/Passes/DataflowInfoManager.h" #include "bolt/Passes/ShrinkWrapping.h" #include "bolt/Passes/StackAvailableExpressions.h" #include "bolt/Passes/StackReachingUses.h" #include "bolt/Utils/CommandLineOpts.h" #include "llvm/Support/Timer.h" #include #include #define DEBUG_TYPE "fop" using namespace llvm; namespace opts { extern cl::opt Verbosity; extern cl::opt TimeOpts; extern cl::OptionCategory BoltOptCategory; using namespace bolt; cl::opt FrameOptimization("frame-opt", cl::init(FOP_NONE), cl::desc("optimize stack frame accesses"), cl::values( clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"), clEnumValN(FOP_ALL, "all", "perform FOP on all functions")), cl::ZeroOrMore, cl::cat(BoltOptCategory)); cl::opt RemoveStores( "frame-opt-rm-stores", cl::init(FOP_NONE), cl::desc("apply additional analysis to remove stores (experimental)"), cl::cat(BoltOptCategory)); } // namespace opts namespace llvm { namespace bolt { void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA, const FrameAnalysis &FA, BinaryFunction &BF) { StackAvailableExpressions SAE(RA, FA, BF); SAE.run(); LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n"); std::deque> ToErase; bool Changed = false; const auto ExprEnd = SAE.expr_end(); MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); for (BinaryBasicBlock &BB : BF) { LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); const MCInst *Prev = nullptr; for (MCInst &Inst : BB) { LLVM_DEBUG({ dbgs() << "\t\tNow at "; Inst.dump(); for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); I != ExprEnd; ++I) { dbgs() << "\t\t\tReached by: "; (*I)->dump(); } }); // if Inst is a load from stack and the current available expressions show // this value is available in a register or immediate, replace this load // with move from register or from immediate. ErrorOr FIEX = FA.getFIEFor(Inst); if (!FIEX) { Prev = &Inst; continue; } // FIXME: Change to remove IsSimple == 0. We're being conservative here, // but once replaceMemOperandWithReg is ready, we should feed it with all // sorts of complex instructions. if (FIEX->IsLoad == false || FIEX->IsSimple == false || FIEX->StackOffset >= 0) { Prev = &Inst; continue; } for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); I != ExprEnd; ++I) { const MCInst *AvailableInst = *I; ErrorOr FIEY = FA.getFIEFor(*AvailableInst); if (!FIEY) continue; assert(FIEY->IsStore && FIEY->IsSimple); if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size) continue; // TODO: Change push/pops to stack adjustment instruction if (MIB->isPop(Inst)) continue; ++NumRedundantLoads; FreqRedundantLoads += BB.getKnownExecutionCount(); Changed = true; LLVM_DEBUG(dbgs() << "Redundant load instruction: "); LLVM_DEBUG(Inst.dump()); LLVM_DEBUG(dbgs() << "Related store instruction: "); LLVM_DEBUG(AvailableInst->dump()); LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); // Replace load if (FIEY->IsStoreFromReg) { if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) { LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n"); break; } FreqLoadsChangedToReg += BB.getKnownExecutionCount(); MIB->removeAnnotation(Inst, "FrameAccessEntry"); LLVM_DEBUG(dbgs() << "Changed operand to a reg\n"); if (MIB->isRedundantMove(Inst)) { ++NumLoadsDeleted; FreqLoadsDeleted += BB.getKnownExecutionCount(); LLVM_DEBUG(dbgs() << "Created a redundant move\n"); // Delete it! ToErase.push_front(std::make_pair(&BB, &Inst)); } } else { char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm; LLVM_DEBUG(dbgs() << "Changing operand to an imm... "); if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { LLVM_DEBUG(dbgs() << "FAILED\n"); } else { FreqLoadsChangedToImm += BB.getKnownExecutionCount(); MIB->removeAnnotation(Inst, "FrameAccessEntry"); LLVM_DEBUG(dbgs() << "Ok\n"); } } LLVM_DEBUG(dbgs() << "Changed to: "); LLVM_DEBUG(Inst.dump()); break; } Prev = &Inst; } } if (Changed) LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); // TODO: Implement an interface of eraseInstruction that works out the // complete list of elements to remove. for (std::pair I : ToErase) I.first->eraseInstruction(I.first->findInstruction(I.second)); } void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, BinaryFunction &BF) { StackReachingUses SRU(FA, BF); SRU.run(); LLVM_DEBUG(dbgs() << "Performing unused stores removal\n"); std::vector> ToErase; bool Changed = false; for (BinaryBasicBlock &BB : BF) { LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); const MCInst *Prev = nullptr; for (MCInst &Inst : llvm::reverse(BB)) { LLVM_DEBUG({ dbgs() << "\t\tNow at "; Inst.dump(); for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB); I != SRU.expr_end(); ++I) { dbgs() << "\t\t\tReached by: "; (*I)->dump(); } }); ErrorOr FIEX = FA.getFIEFor(Inst); if (!FIEX) { Prev = &Inst; continue; } if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) { Prev = &Inst; continue; } if (SRU.isStoreUsed(*FIEX, Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) { Prev = &Inst; continue; } // TODO: Change push/pops to stack adjustment instruction if (BF.getBinaryContext().MIB->isPush(Inst)) continue; ++NumRedundantStores; FreqRedundantStores += BB.getKnownExecutionCount(); Changed = true; LLVM_DEBUG(dbgs() << "Unused store instruction: "); LLVM_DEBUG(Inst.dump()); LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset << " size = " << (int)FIEX->Size << "\n"); // Delete it! ToErase.emplace_back(&BB, &Inst); Prev = &Inst; } } for (std::pair I : ToErase) I.first->eraseInstruction(I.first->findInstruction(I.second)); if (Changed) LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); } void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) { if (opts::FrameOptimization == FOP_NONE) return; std::unique_ptr CG; std::unique_ptr FA; std::unique_ptr RA; { NamedRegionTimer T1("callgraph", "create call graph", "FOP", "FOP breakdown", opts::TimeOpts); CG = std::make_unique(buildCallGraph(BC)); } { NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP", "FOP breakdown", opts::TimeOpts); FA = std::make_unique(BC, *CG); } { NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown", opts::TimeOpts); RA = std::make_unique(BC, &BC.getBinaryFunctions(), CG.get()); } // Perform caller-saved register optimizations, then callee-saved register // optimizations (shrink wrapping) for (auto &I : BC.getBinaryFunctions()) { if (!FA->hasFrameInfo(I.second)) continue; // Restrict pass execution if user asked to only run on hot functions if (opts::FrameOptimization == FOP_HOT) { if (I.second.getKnownExecutionCount() < BC.getHotThreshold()) continue; LLVM_DEBUG( dbgs() << "Considering " << I.second.getPrintName() << " for frame optimizations because its execution count ( " << I.second.getKnownExecutionCount() << " ) exceeds our hotness threshold ( " << BC.getHotThreshold() << " )\n"); } { NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown", opts::TimeOpts); if (!FA->hasStackArithmetic(I.second)) removeUnnecessaryLoads(*RA, *FA, I.second); } if (opts::RemoveStores) { NamedRegionTimer T1("removestores", "remove stores", "FOP", "FOP breakdown", opts::TimeOpts); if (!FA->hasStackArithmetic(I.second)) removeUnusedStores(*FA, I.second); } // Don't even start shrink wrapping if no profiling info is available if (I.second.getKnownExecutionCount() == 0) continue; } { NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP", "FOP breakdown", opts::TimeOpts); performShrinkWrapping(*RA, *FA, BC); } outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads << " redundant load(s) and " << NumRedundantStores << " unused store(s)\n"; outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads << " and frequency of unused stores is " << FreqRedundantStores << "\n"; outs() << "BOLT-INFO: Frequency of loads changed to use a register is " << FreqLoadsChangedToReg << " and frequency of loads changed to use an immediate is " << FreqLoadsChangedToImm << "\n"; outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) (dyn count: " << FreqLoadsDeleted << ") and " << NumRedundantStores << " store(s)\n"; FA->printStats(); ShrinkWrapping::printStats(); } void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA, const FrameAnalysis &FA, BinaryContext &BC) { // Initialize necessary annotations to allow safe parallel accesses to // annotation index in MIB BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName()); BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName()); BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName()); BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName()); BC.MIB->getOrCreateAnnotationIndex( StackLayoutModifier::getOffsetCFIRegTagName()); BC.MIB->getOrCreateAnnotationIndex("ReachingDefs"); BC.MIB->getOrCreateAnnotationIndex("ReachingUses"); BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis"); BC.MIB->getOrCreateAnnotationIndex("StackReachingUses"); BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis"); BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis"); BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking"); BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls"); BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions"); BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis"); BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo"); BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking"); BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward"); BC.MIB->getOrCreateAnnotationIndex("ReachingInsns"); BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos"); BC.MIB->getOrCreateAnnotationIndex("DeleteMe"); std::vector> Top10Funcs; auto LogFunc = [&](BinaryFunction &BF) { auto Lower = llvm::lower_bound( Top10Funcs, BF.getKnownExecutionCount(), [](const std::pair &Elmt, uint64_t Value) { return Elmt.first > Value; }); if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10) return; Top10Funcs.insert(Lower, std::make_pair<>(BF.getKnownExecutionCount(), &BF)); if (Top10Funcs.size() > 10) Top10Funcs.resize(10); }; (void)LogFunc; ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { if (BF.getFunctionScore() == 0) return true; return false; }; const bool HotOnly = opts::FrameOptimization == FOP_HOT; ParallelUtilities::WorkFuncWithAllocTy WorkFunction = [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) { DataflowInfoManager Info(BF, &RA, &FA, AllocatorId); ShrinkWrapping SW(FA, BF, Info, AllocatorId); if (SW.perform(HotOnly)) { std::lock_guard Lock(FuncsChangedMutex); FuncsChanged.insert(&BF); LLVM_DEBUG(LogFunc(BF)); } }; ParallelUtilities::runOnEachFunctionWithUniqueAllocId( BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction, SkipPredicate, "shrink-wrapping"); if (!Top10Funcs.empty()) { outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n"; for (const auto &Elmt : Top10Funcs) outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n"; } } } // namespace bolt } // namespace llvm