| //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements multiple passes for binary optimization and analysis. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "bolt/Passes/BinaryPasses.h" |
| #include "bolt/Core/FunctionLayout.h" |
| #include "bolt/Core/ParallelUtilities.h" |
| #include "bolt/Passes/ReorderAlgorithm.h" |
| #include "bolt/Passes/ReorderFunctions.h" |
| #include "bolt/Utils/CommandLineOpts.h" |
| #include "llvm/Support/CommandLine.h" |
| #include <atomic> |
| #include <mutex> |
| #include <numeric> |
| #include <vector> |
| |
| #define DEBUG_TYPE "bolt-opts" |
| |
| using namespace llvm; |
| using namespace bolt; |
| |
| static const char *dynoStatsOptName(const bolt::DynoStats::Category C) { |
| assert(C > bolt::DynoStats::FIRST_DYNO_STAT && |
| C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category."); |
| |
| static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; |
| |
| OptNames[C] = bolt::DynoStats::Description(C); |
| |
| std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); |
| |
| return OptNames[C].c_str(); |
| } |
| |
| namespace opts { |
| |
| extern cl::OptionCategory BoltCategory; |
| extern cl::OptionCategory BoltOptCategory; |
| |
| extern cl::opt<unsigned> Verbosity; |
| extern cl::opt<bool> EnableBAT; |
| extern cl::opt<unsigned> ExecutionCountThreshold; |
| extern cl::opt<bool> UpdateDebugSections; |
| extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions; |
| |
| enum DynoStatsSortOrder : char { |
| Ascending, |
| Descending |
| }; |
| |
| static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt( |
| "print-sorted-by-order", |
| cl::desc("use ascending or descending order when printing functions " |
| "ordered by dyno stats"), |
| cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory)); |
| |
| cl::list<std::string> |
| HotTextMoveSections("hot-text-move-sections", |
| cl::desc("list of sections containing functions used for hugifying hot text. " |
| "BOLT makes sure these functions are not placed on the same page as " |
| "the hot text. (default=\'.stub,.mover\')."), |
| cl::value_desc("sec1,sec2,sec3,..."), |
| cl::CommaSeparated, |
| cl::ZeroOrMore, |
| cl::cat(BoltCategory)); |
| |
| bool isHotTextMover(const BinaryFunction &Function) { |
| for (std::string &SectionName : opts::HotTextMoveSections) { |
| if (Function.getOriginSectionName() && |
| *Function.getOriginSectionName() == SectionName) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static cl::opt<bool> MinBranchClusters( |
| "min-branch-clusters", |
| cl::desc("use a modified clustering algorithm geared towards minimizing " |
| "branches"), |
| cl::Hidden, cl::cat(BoltOptCategory)); |
| |
| static cl::list<Peepholes::PeepholeOpts> Peepholes( |
| "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"), |
| cl::value_desc("opt1,opt2,opt3,..."), |
| cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"), |
| clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps", |
| "remove double jumps when able"), |
| clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps", |
| "insert tail call traps"), |
| clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches", |
| "remove useless conditional branches"), |
| clEnumValN(Peepholes::PEEP_ALL, "all", |
| "enable all peephole optimizations")), |
| cl::ZeroOrMore, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> |
| PrintFuncStat("print-function-statistics", |
| cl::desc("print statistics about basic block ordering"), |
| cl::init(0), cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool> PrintLargeFunctions( |
| "print-large-functions", |
| cl::desc("print functions that could not be overwritten due to excessive " |
| "size"), |
| cl::init(false), cl::cat(BoltOptCategory)); |
| |
| static cl::list<bolt::DynoStats::Category> |
| PrintSortedBy("print-sorted-by", cl::CommaSeparated, |
| cl::desc("print functions sorted by order of dyno stats"), |
| cl::value_desc("key1,key2,key3,..."), |
| cl::values( |
| #define D(name, description, ...) \ |
| clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name), \ |
| description), |
| REAL_DYNO_STATS |
| #undef D |
| clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all", |
| "sorted by all names")), |
| cl::ZeroOrMore, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool> |
| PrintUnknown("print-unknown", |
| cl::desc("print names of functions with unknown control flow"), |
| cl::cat(BoltCategory), cl::Hidden); |
| |
| static cl::opt<bool> |
| PrintUnknownCFG("print-unknown-cfg", |
| cl::desc("dump CFG of functions with unknown control flow"), |
| cl::cat(BoltCategory), cl::ReallyHidden); |
| |
| // Please MSVC19 with a forward declaration: otherwise it reports an error about |
| // an undeclared variable inside a callback. |
| extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks; |
| cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks( |
| "reorder-blocks", cl::desc("change layout of basic blocks in a function"), |
| cl::init(bolt::ReorderBasicBlocks::LT_NONE), |
| cl::values( |
| clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none", |
| "do not reorder basic blocks"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse", |
| "layout blocks in reverse order"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal", |
| "perform optimal layout based on profile"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, |
| "branch-predictor", |
| "perform optimal layout prioritizing branch " |
| "predictions"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache", |
| "perform optimal layout prioritizing I-cache " |
| "behavior"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+", |
| "perform layout optimizing I-cache behavior"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp", |
| "perform layout optimizing I-cache behavior"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, |
| "cluster-shuffle", "perform random layout of clusters")), |
| cl::ZeroOrMore, cl::cat(BoltOptCategory), |
| cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) { |
| if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) { |
| errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please" |
| << " use '-reorder-blocks=ext-tsp' instead\n"; |
| ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP; |
| } |
| })); |
| |
| static cl::opt<unsigned> ReportBadLayout( |
| "report-bad-layout", |
| cl::desc("print top <uint> functions with suboptimal code layout on input"), |
| cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool> |
| ReportStaleFuncs("report-stale", |
| cl::desc("print the list of functions with stale profile"), |
| cl::Hidden, cl::cat(BoltOptCategory)); |
| |
| enum SctcModes : char { |
| SctcAlways, |
| SctcPreserveDirection, |
| SctcHeuristic |
| }; |
| |
| static cl::opt<SctcModes> |
| SctcMode("sctc-mode", |
| cl::desc("mode for simplify conditional tail calls"), |
| cl::init(SctcAlways), |
| cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), |
| clEnumValN(SctcPreserveDirection, |
| "preserve", |
| "only perform sctc when branch direction is " |
| "preserved"), |
| clEnumValN(SctcHeuristic, |
| "heuristic", |
| "use branch prediction data to control sctc")), |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> |
| StaleThreshold("stale-threshold", |
| cl::desc( |
| "maximum percentage of stale functions to tolerate (default: 100)"), |
| cl::init(100), |
| cl::Hidden, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> TSPThreshold( |
| "tsp-threshold", |
| cl::desc( |
| "maximum number of hot basic blocks in a function for which to use " |
| "a precise TSP solution while re-ordering basic blocks"), |
| cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> TopCalledLimit( |
| "top-called-limit", |
| cl::desc("maximum number of functions to print in top called " |
| "functions section"), |
| cl::init(100), cl::Hidden, cl::cat(BoltCategory)); |
| |
| // Profile density options, synced with llvm-profgen/ProfileGenerator.cpp |
| static cl::opt<int> ProfileDensityCutOffHot( |
| "profile-density-cutoff-hot", cl::init(990000), |
| cl::desc("Total samples cutoff for functions used to calculate " |
| "profile density.")); |
| |
| static cl::opt<double> ProfileDensityThreshold( |
| "profile-density-threshold", cl::init(60), |
| cl::desc("If the profile density is below the given threshold, it " |
| "will be suggested to increase the sampling rate."), |
| cl::Optional); |
| |
| } // namespace opts |
| |
| namespace llvm { |
| namespace bolt { |
| |
| bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { |
| return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && |
| !BF.isIgnored(); |
| } |
| |
| bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { |
| return BF.isSimple() && !BF.isIgnored(); |
| } |
| |
| void NormalizeCFG::runOnFunction(BinaryFunction &BF) { |
| uint64_t NumRemoved = 0; |
| uint64_t NumDuplicateEdges = 0; |
| uint64_t NeedsFixBranches = 0; |
| for (BinaryBasicBlock &BB : BF) { |
| if (!BB.empty()) |
| continue; |
| |
| if (BB.isEntryPoint() || BB.isLandingPad()) |
| continue; |
| |
| // Handle a dangling empty block. |
| if (BB.succ_size() == 0) { |
| // If an empty dangling basic block has a predecessor, it could be a |
| // result of codegen for __builtin_unreachable. In such case, do not |
| // remove the block. |
| if (BB.pred_size() == 0) { |
| BB.markValid(false); |
| ++NumRemoved; |
| } |
| continue; |
| } |
| |
| // The block should have just one successor. |
| BinaryBasicBlock *Successor = BB.getSuccessor(); |
| assert(Successor && "invalid CFG encountered"); |
| |
| // Redirect all predecessors to the successor block. |
| while (!BB.pred_empty()) { |
| BinaryBasicBlock *Predecessor = *BB.pred_begin(); |
| if (Predecessor->hasJumpTable()) |
| break; |
| |
| if (Predecessor == Successor) |
| break; |
| |
| BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB); |
| Predecessor->replaceSuccessor(&BB, Successor, BI.Count, |
| BI.MispredictedCount); |
| // We need to fix branches even if we failed to replace all successors |
| // and remove the block. |
| NeedsFixBranches = true; |
| } |
| |
| if (BB.pred_empty()) { |
| BB.removeAllSuccessors(); |
| BB.markValid(false); |
| ++NumRemoved; |
| } |
| } |
| |
| if (NumRemoved) |
| BF.eraseInvalidBBs(); |
| |
| // Check for duplicate successors. Do it after the empty block elimination as |
| // we can get more duplicate successors. |
| for (BinaryBasicBlock &BB : BF) |
| if (!BB.hasJumpTable() && BB.succ_size() == 2 && |
| BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true)) |
| ++NumDuplicateEdges; |
| |
| // fixBranches() will get rid of duplicate edges and update jump instructions. |
| if (NumDuplicateEdges || NeedsFixBranches) |
| BF.fixBranches(); |
| |
| NumDuplicateEdgesMerged += NumDuplicateEdges; |
| NumBlocksRemoved += NumRemoved; |
| } |
| |
| Error NormalizeCFG::runOnFunctions(BinaryContext &BC) { |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, |
| [&](BinaryFunction &BF) { runOnFunction(BF); }, |
| [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, |
| "NormalizeCFG"); |
| if (NumBlocksRemoved) |
| BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" |
| << (NumBlocksRemoved == 1 ? "" : "s") << '\n'; |
| if (NumDuplicateEdgesMerged) |
| BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged |
| << " duplicate CFG edge" |
| << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n'; |
| return Error::success(); |
| } |
| |
| void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { |
| BinaryContext &BC = Function.getBinaryContext(); |
| unsigned Count; |
| uint64_t Bytes; |
| Function.markUnreachableBlocks(); |
| LLVM_DEBUG({ |
| for (BinaryBasicBlock &BB : Function) { |
| if (!BB.isValid()) { |
| dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() |
| << " in function " << Function << "\n"; |
| Function.dump(); |
| } |
| } |
| }); |
| BinaryContext::IndependentCodeEmitter Emitter = |
| BC.createIndependentMCCodeEmitter(); |
| std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get()); |
| DeletedBlocks += Count; |
| DeletedBytes += Bytes; |
| if (Count) { |
| auto L = BC.scopeLock(); |
| Modified.insert(&Function); |
| if (opts::Verbosity > 0) |
| BC.outs() << "BOLT-INFO: removed " << Count |
| << " dead basic block(s) accounting for " << Bytes |
| << " bytes in function " << Function << '\n'; |
| } |
| } |
| |
| Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { |
| ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| runOnFunction(BF); |
| }; |
| |
| ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { |
| return !shouldOptimize(BF) || BF.getLayout().block_empty(); |
| }; |
| |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, |
| SkipPredicate, "elimininate-unreachable"); |
| |
| if (DeletedBlocks) |
| BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " |
| << DeletedBytes << " bytes of code\n"; |
| return Error::success(); |
| } |
| |
| bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { |
| return (BinaryFunctionPass::shouldPrint(BF) && |
| opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); |
| } |
| |
| bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { |
| // Apply execution count threshold |
| if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) |
| return false; |
| |
| return BinaryFunctionPass::shouldOptimize(BF); |
| } |
| |
| Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { |
| if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) |
| return Error::success(); |
| |
| std::atomic_uint64_t ModifiedFuncCount(0); |
| std::mutex FunctionEditDistanceMutex; |
| DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance; |
| |
| ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder; |
| if (opts::PrintFuncStat > 0) |
| llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder)); |
| |
| const bool LayoutChanged = |
| modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters); |
| if (LayoutChanged) { |
| ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed); |
| if (opts::PrintFuncStat > 0) { |
| const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder); |
| std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex); |
| FunctionEditDistance[&BF] = Distance; |
| } |
| } |
| }; |
| |
| ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
| return !shouldOptimize(BF); |
| }; |
| |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc, |
| "ReorderBasicBlocks"); |
| const size_t NumAllProfiledFunctions = |
| BC.NumProfiledFuncs + BC.NumStaleProfileFuncs; |
| |
| BC.outs() << "BOLT-INFO: basic block reordering modified layout of " |
| << format( |
| "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n", |
| ModifiedFuncCount.load(std::memory_order_relaxed), |
| 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / |
| NumAllProfiledFunctions, |
| 100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) / |
| BC.getBinaryFunctions().size()); |
| |
| if (opts::PrintFuncStat > 0) { |
| raw_ostream &OS = BC.outs(); |
| // Copy all the values into vector in order to sort them |
| std::map<uint64_t, BinaryFunction &> ScoreMap; |
| auto &BFs = BC.getBinaryFunctions(); |
| for (auto It = BFs.begin(); It != BFs.end(); ++It) |
| ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( |
| It->second.getFunctionScore(), It->second)); |
| |
| OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; |
| OS << " There are " << BFs.size() << " functions in total. \n"; |
| OS << " Number of functions being modified: " |
| << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n"; |
| OS << " User asks for detailed information on top " |
| << opts::PrintFuncStat << " functions. (Ranked by function score)" |
| << "\n\n"; |
| uint64_t I = 0; |
| for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = |
| ScoreMap.rbegin(); |
| Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { |
| BinaryFunction &Function = Rit->second; |
| |
| OS << " Information for function of top: " << (I + 1) << ": \n"; |
| OS << " Function Score is: " << Function.getFunctionScore() |
| << "\n"; |
| OS << " There are " << Function.size() |
| << " number of blocks in this function.\n"; |
| OS << " There are " << Function.getInstructionCount() |
| << " number of instructions in this function.\n"; |
| OS << " The edit distance for this function is: " |
| << FunctionEditDistance.lookup(&Function) << "\n\n"; |
| } |
| } |
| return Error::success(); |
| } |
| |
| bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, |
| LayoutType Type, |
| bool MinBranchClusters) const { |
| if (BF.size() == 0 || Type == LT_NONE) |
| return false; |
| |
| BinaryFunction::BasicBlockOrderType NewLayout; |
| std::unique_ptr<ReorderAlgorithm> Algo; |
| |
| // Cannot do optimal layout without profile. |
| if (Type != LT_REVERSE && !BF.hasValidProfile()) |
| return false; |
| |
| if (Type == LT_REVERSE) { |
| Algo.reset(new ReverseReorderAlgorithm()); |
| } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { |
| // Work on optimal solution if problem is small enough |
| LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); |
| Algo.reset(new TSPReorderAlgorithm()); |
| } else { |
| LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); |
| |
| std::unique_ptr<ClusterAlgorithm> CAlgo; |
| if (MinBranchClusters) |
| CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); |
| else |
| CAlgo.reset(new PHGreedyClusterAlgorithm()); |
| |
| switch (Type) { |
| case LT_OPTIMIZE: |
| Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| case LT_OPTIMIZE_BRANCH: |
| Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| case LT_OPTIMIZE_CACHE: |
| Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| case LT_OPTIMIZE_EXT_TSP: |
| Algo.reset(new ExtTSPReorderAlgorithm()); |
| break; |
| |
| case LT_OPTIMIZE_SHUFFLE: |
| Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| default: |
| llvm_unreachable("unexpected layout type"); |
| } |
| } |
| |
| Algo->reorderBasicBlocks(BF, NewLayout); |
| |
| return BF.getLayout().update(NewLayout); |
| } |
| |
| Error FixupBranches::runOnFunctions(BinaryContext &BC) { |
| for (auto &It : BC.getBinaryFunctions()) { |
| BinaryFunction &Function = It.second; |
| if (!BC.shouldEmit(Function) || !Function.isSimple()) |
| continue; |
| |
| Function.fixBranches(); |
| } |
| return Error::success(); |
| } |
| |
| Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) { |
| std::atomic<bool> HasFatal{false}; |
| ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| if (!BF.finalizeCFIState()) { |
| if (BC.HasRelocations) { |
| BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF |
| << ". Exiting.\n"; |
| HasFatal = true; |
| return; |
| } |
| BF.setSimple(false); |
| return; |
| } |
| |
| BF.setFinalized(); |
| |
| // Update exception handling information. |
| BF.updateEHRanges(); |
| }; |
| |
| ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { |
| return !BC.shouldEmit(BF); |
| }; |
| |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, |
| SkipPredicate, "FinalizeFunctions"); |
| if (HasFatal) |
| return createFatalBOLTError("finalize CFI state failure"); |
| return Error::success(); |
| } |
| |
| Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { |
| if (BC.HasRelocations) |
| return Error::success(); |
| |
| // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit |
| // incorrect meta data. |
| ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| uint64_t HotSize, ColdSize; |
| std::tie(HotSize, ColdSize) = |
| BC.calculateEmittedSize(BF, /*FixBranches=*/false); |
| uint64_t MainFragmentSize = HotSize; |
| if (BF.hasIslandsInfo()) { |
| MainFragmentSize += |
| offsetToAlignment(BF.getAddress() + MainFragmentSize, |
| Align(BF.getConstantIslandAlignment())); |
| MainFragmentSize += BF.estimateConstantIslandSize(); |
| } |
| if (MainFragmentSize > BF.getMaxSize()) { |
| if (opts::PrintLargeFunctions) |
| BC.outs() << "BOLT-INFO: " << BF << " size of " << MainFragmentSize |
| << " bytes exceeds allocated space by " |
| << (MainFragmentSize - BF.getMaxSize()) << " bytes\n"; |
| BF.setSimple(false); |
| } |
| }; |
| |
| ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
| return !shouldOptimize(BF); |
| }; |
| |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, |
| SkipFunc, "CheckLargeFunctions"); |
| |
| return Error::success(); |
| } |
| |
| bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { |
| // Unlike other passes, allow functions in non-CFG state. |
| return BF.isSimple() && !BF.isIgnored(); |
| } |
| |
| Error LowerAnnotations::runOnFunctions(BinaryContext &BC) { |
| // Convert GnuArgsSize annotations into CFIs. |
| for (BinaryFunction *BF : BC.getAllBinaryFunctions()) { |
| for (FunctionFragment &FF : BF->getLayout().fragments()) { |
| // Reset at the start of the new fragment. |
| int64_t CurrentGnuArgsSize = 0; |
| |
| for (BinaryBasicBlock *const BB : FF) { |
| for (auto II = BB->begin(); II != BB->end(); ++II) { |
| if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II)) |
| continue; |
| |
| const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); |
| assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size."); |
| if (NewGnuArgsSize == CurrentGnuArgsSize) |
| continue; |
| |
| auto InsertII = BF->addCFIInstruction( |
| BB, II, |
| MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); |
| CurrentGnuArgsSize = NewGnuArgsSize; |
| II = std::next(InsertII); |
| } |
| } |
| } |
| } |
| return Error::success(); |
| } |
| |
| // Check for dirty state in MCSymbol objects that might be a consequence |
| // of running calculateEmittedSize() in parallel, during split functions |
| // pass. If an inconsistent state is found (symbol already registered or |
| // already defined), clean it. |
| Error CleanMCState::runOnFunctions(BinaryContext &BC) { |
| MCContext &Ctx = *BC.Ctx; |
| for (const auto &SymMapEntry : Ctx.getSymbols()) { |
| const MCSymbol *S = SymMapEntry.getValue().Symbol; |
| if (!S) |
| continue; |
| if (S->isDefined()) { |
| LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() |
| << "\" is already defined\n"); |
| const_cast<MCSymbol *>(S)->setUndefined(); |
| } |
| if (S->isRegistered()) { |
| LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() |
| << "\" is already registered\n"); |
| const_cast<MCSymbol *>(S)->setIsRegistered(false); |
| } |
| LLVM_DEBUG(if (S->isVariable()) { |
| dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n"; |
| }); |
| } |
| return Error::success(); |
| } |
| |
| // This peephole fixes jump instructions that jump to another basic |
| // block with a single jump instruction, e.g. |
| // |
| // B0: ... |
| // jmp B1 (or jcc B1) |
| // |
| // B1: jmp B2 |
| // |
| // -> |
| // |
| // B0: ... |
| // jmp B2 (or jcc B2) |
| // |
| static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { |
| uint64_t NumDoubleJumps = 0; |
| |
| MCContext *Ctx = Function.getBinaryContext().Ctx.get(); |
| MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); |
| for (BinaryBasicBlock &BB : Function) { |
| auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ, |
| const MCSymbol *SuccSym, |
| std::optional<uint32_t> Offset) { |
| // Ignore infinite loop jumps or fallthrough tail jumps. |
| if (Pred == Succ || Succ == &BB) |
| return false; |
| |
| if (Succ) { |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| if (!Res) { |
| LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; |
| Pred->dump()); |
| return false; |
| } |
| Pred->replaceSuccessor(&BB, Succ); |
| |
| // We must patch up any existing branch instructions to match up |
| // with the new successor. |
| assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) && |
| "Predecessor block has inconsistent number of successors"); |
| if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) { |
| MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); |
| } else if (UncondBranch && |
| MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) { |
| MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); |
| } else if (!UncondBranch) { |
| assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ && |
| "Don't add an explicit jump to a fallthrough block."); |
| Pred->addBranchInstruction(Succ); |
| } |
| } else { |
| // Succ will be null in the tail call case. In this case we |
| // need to explicitly add a tail call instruction. |
| MCInst *Branch = Pred->getLastNonPseudoInstr(); |
| if (Branch && MIB->isUnconditionalBranch(*Branch)) { |
| assert(MIB->getTargetSymbol(*Branch) == BB.getLabel()); |
| Pred->removeSuccessor(&BB); |
| Pred->eraseInstruction(Pred->findInstruction(Branch)); |
| Pred->addTailCallInstruction(SuccSym); |
| if (Offset) { |
| MCInst *TailCall = Pred->getLastNonPseudoInstr(); |
| assert(TailCall); |
| MIB->setOffset(*TailCall, *Offset); |
| } |
| } else { |
| return false; |
| } |
| } |
| |
| ++NumDoubleJumps; |
| LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from " |
| << Pred->getName() << " -> " << BB.getName() << " to " |
| << Pred->getName() << " -> " << SuccSym->getName() |
| << (!Succ ? " (tail)\n" : "\n")); |
| |
| return true; |
| }; |
| |
| if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) |
| continue; |
| |
| MCInst *Inst = BB.getFirstNonPseudoInstr(); |
| const bool IsTailCall = MIB->isTailCall(*Inst); |
| |
| if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall) |
| continue; |
| |
| // If we operate after SCTC make sure it's not a conditional tail call. |
| if (IsTailCall && MIB->isConditionalBranch(*Inst)) |
| continue; |
| |
| const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst); |
| BinaryBasicBlock *Succ = BB.getSuccessor(); |
| |
| if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) |
| continue; |
| |
| std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()}; |
| |
| for (BinaryBasicBlock *Pred : Preds) { |
| if (Pred->isLandingPad()) |
| continue; |
| |
| if (Pred->getSuccessor() == &BB || |
| (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || |
| Pred->getConditionalSuccessor(false) == &BB) |
| if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) && |
| MarkInvalid) |
| BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() || |
| BB.isEntryPoint()); |
| } |
| } |
| |
| return NumDoubleJumps; |
| } |
| |
| bool SimplifyConditionalTailCalls::shouldRewriteBranch( |
| const BinaryBasicBlock *PredBB, const MCInst &CondBranch, |
| const BinaryBasicBlock *BB, const bool DirectionFlag) { |
| if (BeenOptimized.count(PredBB)) |
| return false; |
| |
| const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); |
| |
| if (IsForward) |
| ++NumOrigForwardBranches; |
| else |
| ++NumOrigBackwardBranches; |
| |
| if (opts::SctcMode == opts::SctcAlways) |
| return true; |
| |
| if (opts::SctcMode == opts::SctcPreserveDirection) |
| return IsForward == DirectionFlag; |
| |
| const ErrorOr<std::pair<double, double>> Frequency = |
| PredBB->getBranchStats(BB); |
| |
| // It's ok to rewrite the conditional branch if the new target will be |
| // a backward branch. |
| |
| // If no data available for these branches, then it should be ok to |
| // do the optimization since it will reduce code size. |
| if (Frequency.getError()) |
| return true; |
| |
| // TODO: should this use misprediction frequency instead? |
| const bool Result = (IsForward && Frequency.get().first >= 0.5) || |
| (!IsForward && Frequency.get().first <= 0.5); |
| |
| return Result == DirectionFlag; |
| } |
| |
| uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) { |
| // Need updated indices to correctly detect branch' direction. |
| BF.getLayout().updateLayoutIndices(); |
| BF.markUnreachableBlocks(); |
| |
| MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get(); |
| MCContext *Ctx = BF.getBinaryContext().Ctx.get(); |
| uint64_t NumLocalCTCCandidates = 0; |
| uint64_t NumLocalCTCs = 0; |
| uint64_t LocalCTCTakenCount = 0; |
| uint64_t LocalCTCExecCount = 0; |
| std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>> |
| NeedsUncondBranch; |
| |
| // Will block be deleted by UCE? |
| auto isValid = [](const BinaryBasicBlock *BB) { |
| return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint()); |
| }; |
| |
| for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| // Locate BB with a single direct tail-call instruction. |
| if (BB->getNumNonPseudos() != 1) |
| continue; |
| |
| MCInst *Instr = BB->getFirstNonPseudoInstr(); |
| if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr)) |
| continue; |
| |
| const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr); |
| if (!CalleeSymbol) |
| continue; |
| |
| // Detect direction of the possible conditional tail call. |
| const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); |
| |
| // Iterate through all predecessors. |
| for (BinaryBasicBlock *PredBB : BB->predecessors()) { |
| BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true); |
| if (!CondSucc) |
| continue; |
| |
| ++NumLocalCTCCandidates; |
| |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| |
| // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz |
| if (!Result) { |
| LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; |
| PredBB->dump()); |
| continue; |
| } |
| |
| assert(Result && "internal error analyzing conditional branch"); |
| assert(CondBranch && "conditional branch expected"); |
| |
| // Skip dynamic branches for now. |
| if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch)) |
| continue; |
| |
| // It's possible that PredBB is also a successor to BB that may have |
| // been processed by a previous iteration of the SCTC loop, in which |
| // case it may have been marked invalid. We should skip rewriting in |
| // this case. |
| if (!PredBB->isValid()) { |
| assert(PredBB->isSuccessor(BB) && |
| "PredBB should be valid if it is not a successor to BB"); |
| continue; |
| } |
| |
| // We don't want to reverse direction of the branch in new order |
| // without further profile analysis. |
| const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; |
| if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) |
| continue; |
| |
| // Record this block so that we don't try to optimize it twice. |
| BeenOptimized.insert(PredBB); |
| |
| uint64_t Count = 0; |
| if (CondSucc != BB) { |
| // Patch the new target address into the conditional branch. |
| MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx); |
| // Since we reversed the condition on the branch we need to change |
| // the target for the unconditional branch or add a unconditional |
| // branch to the old target. This has to be done manually since |
| // fixupBranches is not called after SCTC. |
| NeedsUncondBranch.emplace_back(PredBB, CondSucc); |
| Count = PredBB->getFallthroughBranchInfo().Count; |
| } else { |
| // Change destination of the conditional branch. |
| MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx); |
| Count = PredBB->getTakenBranchInfo().Count; |
| } |
| const uint64_t CTCTakenFreq = |
| Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; |
| |
| // Annotate it, so "isCall" returns true for this jcc |
| MIB->setConditionalTailCall(*CondBranch); |
| // Add info about the conditional tail call frequency, otherwise this |
| // info will be lost when we delete the associated BranchInfo entry |
| auto &CTCAnnotation = |
| MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount"); |
| CTCAnnotation = CTCTakenFreq; |
| // Preserve Offset annotation, used in BAT. |
| // Instr is a direct tail call instruction that was created when CTCs are |
| // first expanded, and has the original CTC offset set. |
| if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr)) |
| MIB->setOffset(*CondBranch, *Offset); |
| |
| // Remove the unused successor which may be eliminated later |
| // if there are no other users. |
| PredBB->removeSuccessor(BB); |
| // Update BB execution count |
| if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) |
| BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); |
| else if (CTCTakenFreq > BB->getKnownExecutionCount()) |
| BB->setExecutionCount(0); |
| |
| ++NumLocalCTCs; |
| LocalCTCTakenCount += CTCTakenFreq; |
| LocalCTCExecCount += PredBB->getKnownExecutionCount(); |
| } |
| |
| // Remove the block from CFG if all predecessors were removed. |
| BB->markValid(isValid(BB)); |
| } |
| |
| // Add unconditional branches at the end of BBs to new successors |
| // as long as the successor is not a fallthrough. |
| for (auto &Entry : NeedsUncondBranch) { |
| BinaryBasicBlock *PredBB = Entry.first; |
| const BinaryBasicBlock *CondSucc = Entry.second; |
| |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| |
| // Find the next valid block. Invalid blocks will be deleted |
| // so they shouldn't be considered fallthrough targets. |
| const BinaryBasicBlock *NextBlock = |
| BF.getLayout().getBasicBlockAfter(PredBB, false); |
| while (NextBlock && !isValid(NextBlock)) |
| NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false); |
| |
| // Get the unconditional successor to this block. |
| const BinaryBasicBlock *PredSucc = PredBB->getSuccessor(); |
| assert(PredSucc && "The other branch should be a tail call"); |
| |
| const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); |
| |
| if (UncondBranch) { |
| if (HasFallthrough) |
| PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch)); |
| else |
| MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx); |
| } else if (!HasFallthrough) { |
| MCInst Branch; |
| MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx); |
| PredBB->addInstruction(Branch); |
| } |
| } |
| |
| if (NumLocalCTCs > 0) { |
| NumDoubleJumps += fixDoubleJumps(BF, true); |
| // Clean-up unreachable tail-call blocks. |
| const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs(); |
| DeletedBlocks += Stats.first; |
| DeletedBytes += Stats.second; |
| |
| assert(BF.validateCFG()); |
| } |
| |
| LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs |
| << " conditional tail calls from a total of " |
| << NumLocalCTCCandidates << " candidates in function " << BF |
| << ". CTCs execution count for this function is " |
| << LocalCTCExecCount << " and CTC taken count is " |
| << LocalCTCTakenCount << "\n";); |
| |
| NumTailCallsPatched += NumLocalCTCs; |
| NumCandidateTailCalls += NumLocalCTCCandidates; |
| CTCExecCount += LocalCTCExecCount; |
| CTCTakenCount += LocalCTCTakenCount; |
| |
| return NumLocalCTCs > 0; |
| } |
| |
| Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { |
| if (!BC.isX86()) |
| return Error::success(); |
| |
| for (auto &It : BC.getBinaryFunctions()) { |
| BinaryFunction &Function = It.second; |
| |
| if (!shouldOptimize(Function)) |
| continue; |
| |
| if (fixTailCalls(Function)) { |
| Modified.insert(&Function); |
| Function.setHasCanonicalCFG(false); |
| } |
| } |
| |
| if (NumTailCallsPatched) |
| BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched |
| << " tail calls (" << NumOrigForwardBranches << " forward)" |
| << " tail calls (" << NumOrigBackwardBranches << " backward)" |
| << " from a total of " << NumCandidateTailCalls |
| << " while removing " << NumDoubleJumps << " double jumps" |
| << " and removing " << DeletedBlocks << " basic blocks" |
| << " totalling " << DeletedBytes |
| << " bytes of code. CTCs total execution count is " |
| << CTCExecCount << " and the number of times CTCs are taken is " |
| << CTCTakenCount << "\n"; |
| return Error::success(); |
| } |
| |
| uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { |
| uint64_t Count = 0; |
| const BinaryContext &BC = Function.getBinaryContext(); |
| for (BinaryBasicBlock &BB : Function) { |
| for (MCInst &Inst : BB) { |
| // Skip shortening instructions with Size annotation. |
| if (BC.MIB->getSize(Inst)) |
| continue; |
| |
| MCInst OriginalInst; |
| if (opts::Verbosity > 2) |
| OriginalInst = Inst; |
| |
| if (!BC.MIB->shortenInstruction(Inst, *BC.STI)) |
| continue; |
| |
| if (opts::Verbosity > 2) { |
| BC.scopeLock(); |
| BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: "; |
| BC.printInstruction(BC.outs(), OriginalInst, 0, &Function); |
| BC.outs() << "BOLT-INFO: to:"; |
| BC.printInstruction(BC.outs(), Inst, 0, &Function); |
| } |
| |
| ++Count; |
| } |
| } |
| |
| return Count; |
| } |
| |
| Error ShortenInstructions::runOnFunctions(BinaryContext &BC) { |
| std::atomic<uint64_t> NumShortened{0}; |
| if (!BC.isX86()) |
| return Error::success(); |
| |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, |
| [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); }, |
| nullptr, "ShortenInstructions"); |
| |
| if (NumShortened) |
| BC.outs() << "BOLT-INFO: " << NumShortened |
| << " instructions were shortened\n"; |
| return Error::success(); |
| } |
| |
| void Peepholes::addTailcallTraps(BinaryFunction &Function) { |
| MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); |
| for (BinaryBasicBlock &BB : Function) { |
| MCInst *Inst = BB.getLastNonPseudoInstr(); |
| if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) { |
| MCInst Trap; |
| MIB->createTrap(Trap); |
| BB.addInstruction(Trap); |
| ++TailCallTraps; |
| } |
| } |
| } |
| |
| void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { |
| for (BinaryBasicBlock &BB : Function) { |
| if (BB.succ_size() != 2) |
| continue; |
| |
| BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true); |
| BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false); |
| if (CondBB != UncondBB) |
| continue; |
| |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| |
| // analyzeBranch() can fail due to unusual branch instructions, |
| // e.g. jrcxz, or jump tables (indirect jump). |
| if (!Result || !CondBranch) |
| continue; |
| |
| BB.removeDuplicateConditionalSuccessor(CondBranch); |
| ++NumUselessCondBranches; |
| } |
| } |
| |
| Error Peepholes::runOnFunctions(BinaryContext &BC) { |
| const char Opts = |
| std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0, |
| [](const char A, const PeepholeOpts B) { return A | B; }); |
| if (Opts == PEEP_NONE) |
| return Error::success(); |
| |
| for (auto &It : BC.getBinaryFunctions()) { |
| BinaryFunction &Function = It.second; |
| if (shouldOptimize(Function)) { |
| if (Opts & PEEP_DOUBLE_JUMPS) |
| NumDoubleJumps += fixDoubleJumps(Function, false); |
| if (Opts & PEEP_TAILCALL_TRAPS) |
| addTailcallTraps(Function); |
| if (Opts & PEEP_USELESS_BRANCHES) |
| removeUselessCondBranches(Function); |
| assert(Function.validateCFG()); |
| } |
| } |
| BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps |
| << " double jumps patched.\n" |
| << "BOLT-INFO: Peephole: " << TailCallTraps |
| << " tail call traps inserted.\n" |
| << "BOLT-INFO: Peephole: " << NumUselessCondBranches |
| << " useless conditional branches removed.\n"; |
| return Error::success(); |
| } |
| |
| bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) { |
| BinaryContext &BC = BF.getBinaryContext(); |
| MCPlusBuilder *MIB = BC.MIB.get(); |
| |
| uint64_t NumLocalLoadsSimplified = 0; |
| uint64_t NumDynamicLocalLoadsSimplified = 0; |
| uint64_t NumLocalLoadsFound = 0; |
| uint64_t NumDynamicLocalLoadsFound = 0; |
| |
| for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { |
| for (MCInst &Inst : *BB) { |
| unsigned Opcode = Inst.getOpcode(); |
| const MCInstrDesc &Desc = BC.MII->get(Opcode); |
| |
| // Skip instructions that do not load from memory. |
| if (!Desc.mayLoad()) |
| continue; |
| |
| // Try to statically evaluate the target memory address; |
| uint64_t TargetAddress; |
| |
| if (MIB->hasPCRelOperand(Inst)) { |
| // Try to find the symbol that corresponds to the PC-relative operand. |
| MCOperand *DispOpI = MIB->getMemOperandDisp(Inst); |
| assert(DispOpI != Inst.end() && "expected PC-relative displacement"); |
| assert(DispOpI->isExpr() && |
| "found PC-relative with non-symbolic displacement"); |
| |
| // Get displacement symbol. |
| const MCSymbol *DisplSymbol; |
| uint64_t DisplOffset; |
| |
| std::tie(DisplSymbol, DisplOffset) = |
| MIB->getTargetSymbolInfo(DispOpI->getExpr()); |
| |
| if (!DisplSymbol) |
| continue; |
| |
| // Look up the symbol address in the global symbols map of the binary |
| // context object. |
| BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName()); |
| if (!BD) |
| continue; |
| TargetAddress = BD->getAddress() + DisplOffset; |
| } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) { |
| continue; |
| } |
| |
| // Get the contents of the section containing the target address of the |
| // memory operand. We are only interested in read-only sections. |
| ErrorOr<BinarySection &> DataSection = |
| BC.getSectionForAddress(TargetAddress); |
| if (!DataSection || DataSection->isWritable()) |
| continue; |
| |
| if (BC.getRelocationAt(TargetAddress) || |
| BC.getDynamicRelocationAt(TargetAddress)) |
| continue; |
| |
| uint32_t Offset = TargetAddress - DataSection->getAddress(); |
| StringRef ConstantData = DataSection->getContents(); |
| |
| ++NumLocalLoadsFound; |
| if (BB->hasProfile()) |
| NumDynamicLocalLoadsFound += BB->getExecutionCount(); |
| |
| if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { |
| ++NumLocalLoadsSimplified; |
| if (BB->hasProfile()) |
| NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); |
| } |
| } |
| } |
| |
| NumLoadsFound += NumLocalLoadsFound; |
| NumDynamicLoadsFound += NumDynamicLocalLoadsFound; |
| NumLoadsSimplified += NumLocalLoadsSimplified; |
| NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; |
| |
| return NumLocalLoadsSimplified > 0; |
| } |
| |
| Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { |
| for (auto &It : BC.getBinaryFunctions()) { |
| BinaryFunction &Function = It.second; |
| if (shouldOptimize(Function) && simplifyRODataLoads(Function)) |
| Modified.insert(&Function); |
| } |
| |
| BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " |
| << NumLoadsFound << " loads from a statically computed address.\n" |
| << "BOLT-INFO: dynamic loads simplified: " |
| << NumDynamicLoadsSimplified << "\n" |
| << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound |
| << "\n"; |
| return Error::success(); |
| } |
| |
| Error AssignSections::runOnFunctions(BinaryContext &BC) { |
| for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { |
| Function->setCodeSectionName(BC.getInjectedCodeSectionName()); |
| Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); |
| } |
| |
| // In non-relocation mode functions have pre-assigned section names. |
| if (!BC.HasRelocations) |
| return Error::success(); |
| |
| const bool UseColdSection = |
| BC.NumProfiledFuncs > 0 || |
| opts::ReorderFunctions == ReorderFunctions::RT_USER; |
| for (auto &BFI : BC.getBinaryFunctions()) { |
| BinaryFunction &Function = BFI.second; |
| if (opts::isHotTextMover(Function)) { |
| Function.setCodeSectionName(BC.getHotTextMoverSectionName()); |
| Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); |
| continue; |
| } |
| |
| if (!UseColdSection || Function.hasValidIndex()) |
| Function.setCodeSectionName(BC.getMainCodeSectionName()); |
| else |
| Function.setCodeSectionName(BC.getColdCodeSectionName()); |
| |
| if (Function.isSplit()) |
| Function.setColdCodeSectionName(BC.getColdCodeSectionName()); |
| } |
| return Error::success(); |
| } |
| |
| Error PrintProfileStats::runOnFunctions(BinaryContext &BC) { |
| double FlowImbalanceMean = 0.0; |
| size_t NumBlocksConsidered = 0; |
| double WorstBias = 0.0; |
| const BinaryFunction *WorstBiasFunc = nullptr; |
| |
| // For each function CFG, we fill an IncomingMap with the sum of the frequency |
| // of incoming edges for each BB. Likewise for each OutgoingMap and the sum |
| // of the frequency of outgoing edges. |
| using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>; |
| std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps; |
| std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps; |
| |
| // Compute mean |
| for (const auto &BFI : BC.getBinaryFunctions()) { |
| const BinaryFunction &Function = BFI.second; |
| if (Function.empty() || !Function.isSimple()) |
| continue; |
| FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; |
| FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; |
| for (const BinaryBasicBlock &BB : Function) { |
| uint64_t TotalOutgoing = 0ULL; |
| auto SuccBIIter = BB.branch_info_begin(); |
| for (BinaryBasicBlock *Succ : BB.successors()) { |
| uint64_t Count = SuccBIIter->Count; |
| if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) { |
| ++SuccBIIter; |
| continue; |
| } |
| TotalOutgoing += Count; |
| IncomingMap[Succ] += Count; |
| ++SuccBIIter; |
| } |
| OutgoingMap[&BB] = TotalOutgoing; |
| } |
| |
| size_t NumBlocks = 0; |
| double Mean = 0.0; |
| for (const BinaryBasicBlock &BB : Function) { |
| // Do not compute score for low frequency blocks, entry or exit blocks |
| if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint()) |
| continue; |
| ++NumBlocks; |
| const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; |
| Mean += fabs(Difference / IncomingMap[&BB]); |
| } |
| |
| FlowImbalanceMean += Mean; |
| NumBlocksConsidered += NumBlocks; |
| if (!NumBlocks) |
| continue; |
| double FuncMean = Mean / NumBlocks; |
| if (FuncMean > WorstBias) { |
| WorstBias = FuncMean; |
| WorstBiasFunc = &Function; |
| } |
| } |
| if (NumBlocksConsidered > 0) |
| FlowImbalanceMean /= NumBlocksConsidered; |
| |
| // Compute standard deviation |
| NumBlocksConsidered = 0; |
| double FlowImbalanceVar = 0.0; |
| for (const auto &BFI : BC.getBinaryFunctions()) { |
| const BinaryFunction &Function = BFI.second; |
| if (Function.empty() || !Function.isSimple()) |
| continue; |
| FlowMapTy &IncomingMap = TotalIncomingMaps[&Function]; |
| FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function]; |
| for (const BinaryBasicBlock &BB : Function) { |
| if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0) |
| continue; |
| ++NumBlocksConsidered; |
| const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB]; |
| FlowImbalanceVar += |
| pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2); |
| } |
| } |
| if (NumBlocksConsidered) { |
| FlowImbalanceVar /= NumBlocksConsidered; |
| FlowImbalanceVar = sqrt(FlowImbalanceVar); |
| } |
| |
| // Report to user |
| BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n", |
| (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar)); |
| if (WorstBiasFunc && opts::Verbosity >= 1) { |
| BC.outs() << "Worst average bias observed in " |
| << WorstBiasFunc->getPrintName() << "\n"; |
| LLVM_DEBUG(WorstBiasFunc->dump()); |
| } |
| return Error::success(); |
| } |
| |
| Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { |
| uint64_t NumRegularFunctions = 0; |
| uint64_t NumStaleProfileFunctions = 0; |
| uint64_t NumAllStaleFunctions = 0; |
| uint64_t NumInferredFunctions = 0; |
| uint64_t NumNonSimpleProfiledFunctions = 0; |
| uint64_t NumUnknownControlFlowFunctions = 0; |
| uint64_t TotalSampleCount = 0; |
| uint64_t StaleSampleCount = 0; |
| uint64_t InferredSampleCount = 0; |
| std::vector<const BinaryFunction *> ProfiledFunctions; |
| std::vector<std::pair<double, uint64_t>> FuncDensityList; |
| const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; |
| for (auto &BFI : BC.getBinaryFunctions()) { |
| const BinaryFunction &Function = BFI.second; |
| |
| // Ignore PLT functions for stats. |
| if (Function.isPLTFunction()) |
| continue; |
| |
| // Adjustment for BAT mode: the profile for BOLT split fragments is combined |
| // so only count the hot fragment. |
| const uint64_t Address = Function.getAddress(); |
| bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() && |
| BAT && BAT->isBATFunction(Address) && |
| !BAT->fetchParentAddress(Address); |
| |
| ++NumRegularFunctions; |
| |
| // In BOLTed binaries split functions are non-simple (due to non-relocation |
| // mode), but the original function is known to be simple and we have a |
| // valid profile for it. |
| if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) { |
| if (Function.hasProfile()) |
| ++NumNonSimpleProfiledFunctions; |
| continue; |
| } |
| |
| if (Function.hasUnknownControlFlow()) { |
| if (opts::PrintUnknownCFG) |
| Function.dump(); |
| else if (opts::PrintUnknown) |
| BC.errs() << "function with unknown control flow: " << Function << '\n'; |
| |
| ++NumUnknownControlFlowFunctions; |
| } |
| |
| if (!Function.hasProfile()) |
| continue; |
| |
| uint64_t SampleCount = Function.getRawBranchCount(); |
| TotalSampleCount += SampleCount; |
| |
| if (Function.hasValidProfile()) { |
| ProfiledFunctions.push_back(&Function); |
| if (Function.hasInferredProfile()) { |
| ++NumInferredFunctions; |
| InferredSampleCount += SampleCount; |
| ++NumAllStaleFunctions; |
| } |
| } else { |
| if (opts::ReportStaleFuncs) { |
| BC.outs() << StaleFuncsHeader; |
| StaleFuncsHeader = ""; |
| BC.outs() << " " << Function << '\n'; |
| } |
| ++NumStaleProfileFunctions; |
| StaleSampleCount += SampleCount; |
| ++NumAllStaleFunctions; |
| } |
| |
| if (opts::ShowDensity) { |
| uint64_t Size = Function.getSize(); |
| // In case of BOLT split functions registered in BAT, executed traces are |
| // automatically attributed to the main fragment. Add up function sizes |
| // for all fragments. |
| if (IsHotParentOfBOLTSplitFunction) |
| for (const BinaryFunction *Fragment : Function.getFragments()) |
| Size += Fragment->getSize(); |
| double Density = (double)1.0 * Function.getSampleCountInBytes() / Size; |
| FuncDensityList.emplace_back(Density, SampleCount); |
| LLVM_DEBUG(BC.outs() << Function << ": executed bytes " |
| << Function.getSampleCountInBytes() << ", size (b) " |
| << Size << ", density " << Density |
| << ", sample count " << SampleCount << '\n'); |
| } |
| } |
| BC.NumProfiledFuncs = ProfiledFunctions.size(); |
| BC.NumStaleProfileFuncs = NumStaleProfileFunctions; |
| |
| const size_t NumAllProfiledFunctions = |
| ProfiledFunctions.size() + NumStaleProfileFunctions; |
| BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " |
| << NumRegularFunctions << " functions in the binary (" |
| << format("%.1f", NumAllProfiledFunctions / |
| (float)NumRegularFunctions * 100.0f) |
| << "%) have non-empty execution profile\n"; |
| if (NumNonSimpleProfiledFunctions) { |
| BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" |
| << (NumNonSimpleProfiledFunctions == 1 ? "" : "s") |
| << " with profile could not be optimized\n"; |
| } |
| if (NumAllStaleFunctions) { |
| const float PctStale = |
| NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; |
| const float PctStaleFuncsWithEqualBlockCount = |
| (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / |
| NumAllStaleFunctions * 100.0f; |
| const float PctStaleBlocksWithEqualIcount = |
| (float)BC.Stats.NumStaleBlocksWithEqualIcount / |
| BC.Stats.NumStaleBlocks * 100.0f; |
| auto printErrorOrWarning = [&]() { |
| if (PctStale > opts::StaleThreshold) |
| BC.errs() << "BOLT-ERROR: "; |
| else |
| BC.errs() << "BOLT-WARNING: "; |
| }; |
| printErrorOrWarning(); |
| BC.errs() << NumAllStaleFunctions |
| << format(" (%.1f%% of all profiled)", PctStale) << " function" |
| << (NumAllStaleFunctions == 1 ? "" : "s") |
| << " have invalid (possibly stale) profile." |
| " Use -report-stale to see the list.\n"; |
| if (TotalSampleCount > 0) { |
| printErrorOrWarning(); |
| BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of " |
| << TotalSampleCount << " samples in the binary (" |
| << format("%.1f", |
| ((100.0f * (StaleSampleCount + InferredSampleCount)) / |
| TotalSampleCount)) |
| << "%) belong to functions with invalid" |
| " (possibly stale) profile.\n"; |
| } |
| BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount |
| << " stale function" |
| << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s") |
| << format(" (%.1f%% of all stale)", |
| PctStaleFuncsWithEqualBlockCount) |
| << " have matching block count.\n"; |
| BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount |
| << " stale block" |
| << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s") |
| << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount) |
| << " have matching icount.\n"; |
| if (PctStale > opts::StaleThreshold) { |
| return createFatalBOLTError( |
| Twine("BOLT-ERROR: stale functions exceed specified threshold of ") + |
| Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n")); |
| } |
| } |
| if (NumInferredFunctions) { |
| BC.outs() << format( |
| "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " |
| "%.2f%% of stale) functions responsible for %.2f%% samples" |
| " (%zu out of %zu)\n", |
| NumInferredFunctions, |
| 100.0 * NumInferredFunctions / NumAllProfiledFunctions, |
| 100.0 * NumInferredFunctions / NumAllStaleFunctions, |
| 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, |
| TotalSampleCount); |
| BC.outs() << format( |
| "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" |
| " (%zu out of %zu stale) responsible for %.2f%% samples" |
| " (%zu out of %zu stale)\n", |
| 100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks, |
| BC.Stats.NumExactMatchedBlocks, BC.Stats.NumStaleBlocks, |
| 100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount, |
| BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount); |
| BC.outs() << format( |
| "BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of " |
| "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" |
| " (%zu out of %zu stale)\n", |
| 100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks / |
| BC.Stats.NumStaleBlocks, |
| BC.Stats.NumPseudoProbeExactMatchedBlocks, BC.Stats.NumStaleBlocks, |
| 100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount / |
| BC.Stats.StaleSampleCount, |
| BC.Stats.PseudoProbeExactMatchedSampleCount, BC.Stats.StaleSampleCount); |
| BC.outs() << format( |
| "BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of " |
| "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" |
| " (%zu out of %zu stale)\n", |
| 100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks / |
| BC.Stats.NumStaleBlocks, |
| BC.Stats.NumPseudoProbeLooseMatchedBlocks, BC.Stats.NumStaleBlocks, |
| 100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount / |
| BC.Stats.StaleSampleCount, |
| BC.Stats.PseudoProbeLooseMatchedSampleCount, BC.Stats.StaleSampleCount); |
| BC.outs() << format( |
| "BOLT-INFO: inference found a call match for %.2f%% of basic " |
| "blocks" |
| " (%zu out of %zu stale) responsible for %.2f%% samples" |
| " (%zu out of %zu stale)\n", |
| 100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks, |
| BC.Stats.NumCallMatchedBlocks, BC.Stats.NumStaleBlocks, |
| 100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount, |
| BC.Stats.CallMatchedSampleCount, BC.Stats.StaleSampleCount); |
| BC.outs() << format( |
| "BOLT-INFO: inference found a loose match for %.2f%% of basic " |
| "blocks" |
| " (%zu out of %zu stale) responsible for %.2f%% samples" |
| " (%zu out of %zu stale)\n", |
| 100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks, |
| BC.Stats.NumLooseMatchedBlocks, BC.Stats.NumStaleBlocks, |
| 100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount, |
| BC.Stats.LooseMatchedSampleCount, BC.Stats.StaleSampleCount); |
| } |
| |
| if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { |
| BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects |
| << " objects was ignored\n"; |
| } |
| |
| if (ProfiledFunctions.size() > 10) { |
| if (opts::Verbosity >= 1) { |
| BC.outs() << "BOLT-INFO: top called functions are:\n"; |
| llvm::sort(ProfiledFunctions, |
| [](const BinaryFunction *A, const BinaryFunction *B) { |
| return B->getExecutionCount() < A->getExecutionCount(); |
| }); |
| auto SFI = ProfiledFunctions.begin(); |
| auto SFIend = ProfiledFunctions.end(); |
| for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; |
| ++SFI, ++I) |
| BC.outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() |
| << '\n'; |
| } |
| } |
| |
| if (!opts::PrintSortedBy.empty()) { |
| std::vector<BinaryFunction *> Functions; |
| std::map<const BinaryFunction *, DynoStats> Stats; |
| |
| for (auto &BFI : BC.getBinaryFunctions()) { |
| BinaryFunction &BF = BFI.second; |
| if (shouldOptimize(BF) && BF.hasValidProfile()) { |
| Functions.push_back(&BF); |
| Stats.emplace(&BF, getDynoStats(BF)); |
| } |
| } |
| |
| const bool SortAll = |
| llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT); |
| |
| const bool Ascending = |
| opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; |
| |
| std::function<bool(const DynoStats &, const DynoStats &)> |
| DynoStatsComparator = |
| SortAll ? [](const DynoStats &StatsA, |
| const DynoStats &StatsB) { return StatsA < StatsB; } |
| : [](const DynoStats &StatsA, const DynoStats &StatsB) { |
| return StatsA.lessThan(StatsB, opts::PrintSortedBy); |
| }; |
| |
| llvm::stable_sort(Functions, |
| [Ascending, &Stats, DynoStatsComparator]( |
| const BinaryFunction *A, const BinaryFunction *B) { |
| auto StatsItr = Stats.find(A); |
| assert(StatsItr != Stats.end()); |
| const DynoStats &StatsA = StatsItr->second; |
| |
| StatsItr = Stats.find(B); |
| assert(StatsItr != Stats.end()); |
| const DynoStats &StatsB = StatsItr->second; |
| |
| return Ascending ? DynoStatsComparator(StatsA, StatsB) |
| : DynoStatsComparator(StatsB, StatsA); |
| }); |
| |
| BC.outs() << "BOLT-INFO: top functions sorted by "; |
| if (SortAll) { |
| BC.outs() << "dyno stats"; |
| } else { |
| BC.outs() << "("; |
| bool PrintComma = false; |
| for (const DynoStats::Category Category : opts::PrintSortedBy) { |
| if (PrintComma) |
| BC.outs() << ", "; |
| BC.outs() << DynoStats::Description(Category); |
| PrintComma = true; |
| } |
| BC.outs() << ")"; |
| } |
| |
| BC.outs() << " are:\n"; |
| auto SFI = Functions.begin(); |
| for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { |
| const DynoStats Stats = getDynoStats(**SFI); |
| BC.outs() << " " << **SFI; |
| if (!SortAll) { |
| BC.outs() << " ("; |
| bool PrintComma = false; |
| for (const DynoStats::Category Category : opts::PrintSortedBy) { |
| if (PrintComma) |
| BC.outs() << ", "; |
| BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; |
| PrintComma = true; |
| } |
| BC.outs() << ")"; |
| } |
| BC.outs() << "\n"; |
| } |
| } |
| |
| if (!BC.TrappedFunctions.empty()) { |
| BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" |
| << (BC.TrappedFunctions.size() > 1 ? "s" : "") |
| << " will trap on entry. Use -trap-avx512=0 to disable" |
| " traps."; |
| if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { |
| BC.errs() << '\n'; |
| for (const BinaryFunction *Function : BC.TrappedFunctions) |
| BC.errs() << " " << *Function << '\n'; |
| } else { |
| BC.errs() << " Use -v=1 to see the list.\n"; |
| } |
| } |
| |
| // Collect and print information about suboptimal code layout on input. |
| if (opts::ReportBadLayout) { |
| std::vector<BinaryFunction *> SuboptimalFuncs; |
| for (auto &BFI : BC.getBinaryFunctions()) { |
| BinaryFunction &BF = BFI.second; |
| if (!BF.hasValidProfile()) |
| continue; |
| |
| const uint64_t HotThreshold = |
| std::max<uint64_t>(BF.getKnownExecutionCount(), 1); |
| bool HotSeen = false; |
| for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) { |
| if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { |
| HotSeen = true; |
| continue; |
| } |
| if (HotSeen && BB->getKnownExecutionCount() == 0) { |
| SuboptimalFuncs.push_back(&BF); |
| break; |
| } |
| } |
| } |
| |
| if (!SuboptimalFuncs.empty()) { |
| llvm::sort(SuboptimalFuncs, |
| [](const BinaryFunction *A, const BinaryFunction *B) { |
| return A->getKnownExecutionCount() / A->getSize() > |
| B->getKnownExecutionCount() / B->getSize(); |
| }); |
| |
| BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size() |
| << " functions have " |
| "cold code in the middle of hot code. Top functions are:\n"; |
| for (unsigned I = 0; |
| I < std::min(static_cast<size_t>(opts::ReportBadLayout), |
| SuboptimalFuncs.size()); |
| ++I) |
| SuboptimalFuncs[I]->print(BC.outs()); |
| } |
| } |
| |
| if (NumUnknownControlFlowFunctions) { |
| BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions |
| << " functions have instructions with unknown control flow"; |
| if (!opts::PrintUnknown) |
| BC.outs() << ". Use -print-unknown to see the list."; |
| BC.outs() << '\n'; |
| } |
| |
| if (opts::ShowDensity) { |
| double Density = 0.0; |
| // Sorted by the density in descending order. |
| llvm::stable_sort(FuncDensityList, |
| [&](const std::pair<double, uint64_t> &A, |
| const std::pair<double, uint64_t> &B) { |
| if (A.first != B.first) |
| return A.first > B.first; |
| return A.second < B.second; |
| }); |
| |
| uint64_t AccumulatedSamples = 0; |
| uint32_t I = 0; |
| assert(opts::ProfileDensityCutOffHot <= 1000000 && |
| "The cutoff value is greater than 1000000(100%)"); |
| while (AccumulatedSamples < |
| TotalSampleCount * |
| static_cast<float>(opts::ProfileDensityCutOffHot) / |
| 1000000 && |
| I < FuncDensityList.size()) { |
| AccumulatedSamples += FuncDensityList[I].second; |
| Density = FuncDensityList[I].first; |
| I++; |
| } |
| if (Density == 0.0) { |
| BC.errs() << "BOLT-WARNING: the output profile is empty or the " |
| "--profile-density-cutoff-hot option is " |
| "set too low. Please check your command.\n"; |
| } else if (Density < opts::ProfileDensityThreshold) { |
| BC.errs() |
| << "BOLT-WARNING: BOLT is estimated to optimize better with " |
| << format("%.1f", opts::ProfileDensityThreshold / Density) |
| << "x more samples. Please consider increasing sampling rate or " |
| "profiling for longer duration to get more samples.\n"; |
| } |
| |
| BC.outs() << "BOLT-INFO: Functions with density >= " |
| << format("%.1f", Density) << " account for " |
| << format("%.2f", |
| static_cast<double>(opts::ProfileDensityCutOffHot) / |
| 10000) |
| << "% total sample counts.\n"; |
| } |
| return Error::success(); |
| } |
| |
| Error InstructionLowering::runOnFunctions(BinaryContext &BC) { |
| for (auto &BFI : BC.getBinaryFunctions()) |
| for (BinaryBasicBlock &BB : BFI.second) |
| for (MCInst &Instruction : BB) |
| BC.MIB->lowerTailCall(Instruction); |
| return Error::success(); |
| } |
| |
| Error StripRepRet::runOnFunctions(BinaryContext &BC) { |
| if (!BC.isX86()) |
| return Error::success(); |
| |
| uint64_t NumPrefixesRemoved = 0; |
| uint64_t NumBytesSaved = 0; |
| for (auto &BFI : BC.getBinaryFunctions()) { |
| for (BinaryBasicBlock &BB : BFI.second) { |
| auto LastInstRIter = BB.getLastNonPseudo(); |
| if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) || |
| !BC.MIB->deleteREPPrefix(*LastInstRIter)) |
| continue; |
| |
| NumPrefixesRemoved += BB.getKnownExecutionCount(); |
| ++NumBytesSaved; |
| } |
| } |
| |
| if (NumBytesSaved) |
| BC.outs() << "BOLT-INFO: removed " << NumBytesSaved |
| << " 'repz' prefixes" |
| " with estimated execution count of " |
| << NumPrefixesRemoved << " times.\n"; |
| return Error::success(); |
| } |
| |
| Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { |
| if (!BC.isX86()) |
| return Error::success(); |
| |
| uint64_t NumInlined = 0; |
| uint64_t NumInlinedDyno = 0; |
| for (auto &BFI : BC.getBinaryFunctions()) { |
| for (BinaryBasicBlock &BB : BFI.second) { |
| for (auto II = BB.begin(); II != BB.end(); ++II) { |
| MCInst &Inst = *II; |
| |
| if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || |
| !Inst.getOperand(0).isExpr()) |
| continue; |
| |
| const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); |
| if (CalleeSymbol->getName() != "memcpy" && |
| CalleeSymbol->getName() != "memcpy@PLT" && |
| CalleeSymbol->getName() != "_memcpy8") |
| continue; |
| |
| const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); |
| const bool IsTailCall = BC.MIB->isTailCall(Inst); |
| |
| const InstructionListType NewCode = |
| BC.MIB->createInlineMemcpy(IsMemcpy8); |
| II = BB.replaceInstruction(II, NewCode); |
| std::advance(II, NewCode.size() - 1); |
| if (IsTailCall) { |
| MCInst Return; |
| BC.MIB->createReturn(Return); |
| II = BB.insertInstruction(std::next(II), std::move(Return)); |
| } |
| |
| ++NumInlined; |
| NumInlinedDyno += BB.getKnownExecutionCount(); |
| } |
| } |
| } |
| |
| if (NumInlined) { |
| BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls"; |
| if (NumInlinedDyno) |
| BC.outs() << ". The calls were executed " << NumInlinedDyno |
| << " times based on profile."; |
| BC.outs() << '\n'; |
| } |
| return Error::success(); |
| } |
| |
| bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { |
| if (!BinaryFunctionPass::shouldOptimize(Function)) |
| return false; |
| |
| for (const std::string &FunctionSpec : Spec) { |
| StringRef FunctionName = StringRef(FunctionSpec).split(':').first; |
| if (Function.hasNameRegex(FunctionName)) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( |
| const BinaryFunction &Function) const { |
| StringRef SitesString; |
| for (const std::string &FunctionSpec : Spec) { |
| StringRef FunctionName; |
| std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':'); |
| if (Function.hasNameRegex(FunctionName)) |
| break; |
| SitesString = ""; |
| } |
| |
| std::set<size_t> Sites; |
| SmallVector<StringRef, 4> SitesVec; |
| SitesString.split(SitesVec, ':'); |
| for (StringRef SiteString : SitesVec) { |
| if (SiteString.empty()) |
| continue; |
| size_t Result; |
| if (!SiteString.getAsInteger(10, Result)) |
| Sites.emplace(Result); |
| } |
| |
| return Sites; |
| } |
| |
| Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) { |
| if (!BC.isX86()) |
| return Error::success(); |
| |
| uint64_t NumSpecialized = 0; |
| uint64_t NumSpecializedDyno = 0; |
| for (auto &BFI : BC.getBinaryFunctions()) { |
| BinaryFunction &Function = BFI.second; |
| if (!shouldOptimize(Function)) |
| continue; |
| |
| std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function); |
| auto shouldOptimize = [&](size_t N) { |
| return CallsToOptimize.empty() || CallsToOptimize.count(N); |
| }; |
| |
| std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend()); |
| size_t CallSiteID = 0; |
| for (BinaryBasicBlock *CurBB : Blocks) { |
| for (auto II = CurBB->begin(); II != CurBB->end(); ++II) { |
| MCInst &Inst = *II; |
| |
| if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 || |
| !Inst.getOperand(0).isExpr()) |
| continue; |
| |
| const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst); |
| if (CalleeSymbol->getName() != "memcpy" && |
| CalleeSymbol->getName() != "memcpy@PLT") |
| continue; |
| |
| if (BC.MIB->isTailCall(Inst)) |
| continue; |
| |
| ++CallSiteID; |
| |
| if (!shouldOptimize(CallSiteID)) |
| continue; |
| |
| // Create a copy of a call to memcpy(dest, src, size). |
| MCInst MemcpyInstr = Inst; |
| |
| BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II); |
| |
| BinaryBasicBlock *NextBB = nullptr; |
| if (OneByteMemcpyBB->getNumNonPseudos() > 1) { |
| NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin()); |
| NextBB->eraseInstruction(NextBB->begin()); |
| } else { |
| NextBB = OneByteMemcpyBB->getSuccessor(); |
| OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin()); |
| assert(NextBB && "unexpected call to memcpy() with no return"); |
| } |
| |
| BinaryBasicBlock *MemcpyBB = Function.addBasicBlock(); |
| MemcpyBB->setOffset(CurBB->getInputOffset()); |
| InstructionListType CmpJCC = |
| BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1, |
| OneByteMemcpyBB->getLabel(), BC.Ctx.get()); |
| CurBB->addInstructions(CmpJCC); |
| CurBB->addSuccessor(MemcpyBB); |
| |
| MemcpyBB->addInstruction(std::move(MemcpyInstr)); |
| MemcpyBB->addSuccessor(NextBB); |
| MemcpyBB->setCFIState(NextBB->getCFIState()); |
| MemcpyBB->setExecutionCount(0); |
| |
| // To prevent the actual call from being moved to cold, we set its |
| // execution count to 1. |
| if (CurBB->getKnownExecutionCount() > 0) |
| MemcpyBB->setExecutionCount(1); |
| |
| InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy(); |
| OneByteMemcpyBB->addInstructions(OneByteMemcpy); |
| |
| ++NumSpecialized; |
| NumSpecializedDyno += CurBB->getKnownExecutionCount(); |
| |
| CurBB = NextBB; |
| |
| // Note: we don't expect the next instruction to be a call to memcpy. |
| II = CurBB->begin(); |
| } |
| } |
| } |
| |
| if (NumSpecialized) { |
| BC.outs() << "BOLT-INFO: specialized " << NumSpecialized |
| << " memcpy() call sites for size 1"; |
| if (NumSpecializedDyno) |
| BC.outs() << ". The calls were executed " << NumSpecializedDyno |
| << " times based on profile."; |
| BC.outs() << '\n'; |
| } |
| return Error::success(); |
| } |
| |
| void RemoveNops::runOnFunction(BinaryFunction &BF) { |
| const BinaryContext &BC = BF.getBinaryContext(); |
| for (BinaryBasicBlock &BB : BF) { |
| for (int64_t I = BB.size() - 1; I >= 0; --I) { |
| MCInst &Inst = BB.getInstructionAtIndex(I); |
| if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP")) |
| BB.eraseInstructionAtIndex(I); |
| } |
| } |
| } |
| |
| Error RemoveNops::runOnFunctions(BinaryContext &BC) { |
| ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| runOnFunction(BF); |
| }; |
| |
| ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
| return BF.shouldPreserveNops(); |
| }; |
| |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, |
| SkipFunc, "RemoveNops"); |
| return Error::success(); |
| } |
| |
| } // namespace bolt |
| } // namespace llvm |