1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass performs partial inlining, typically by inlining an if statement
11 // that surrounds the body of the function.
12 //
13 //===----------------------------------------------------------------------===//
14
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <utility>
#include <vector>
64
65 using namespace llvm;
66
67 #define DEBUG_TYPE "partial-inlining"
68
69 STATISTIC(NumPartialInlined,
70 "Number of callsites functions partially inlined into.");
71 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
72 "cold outlined regions were partially "
73 "inlined into its caller(s).");
74 STATISTIC(NumColdRegionsFound,
75 "Number of cold single entry/exit regions found.");
76 STATISTIC(NumColdRegionsOutlined,
77 "Number of cold single entry/exit regions outlined.");
78
79 // Command line option to disable partial-inlining. The default is false:
80 static cl::opt<bool>
81 DisablePartialInlining("disable-partial-inlining", cl::init(false),
82 cl::Hidden, cl::desc("Disable partial inlining"));
83 // Command line option to disable multi-region partial-inlining. The default is
84 // false:
85 static cl::opt<bool> DisableMultiRegionPartialInline(
86 "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
87 cl::desc("Disable multi-region partial inlining"));
88
89 // Command line option to force outlining in regions with live exit variables.
90 // The default is false:
91 static cl::opt<bool>
92 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
93 cl::desc("Force outline regions with live exits"));
94
95 // Command line option to enable marking outline functions with Cold Calling
96 // Convention. The default is false:
97 static cl::opt<bool>
98 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
99 cl::desc("Mark outline function calls with ColdCC"));
100
101 #ifndef NDEBUG
102 // Command line option to debug partial-inlining. The default is none:
103 static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
104 cl::init(false), cl::Hidden,
105 cl::desc("Trace partial inlining."));
106 #endif
107
108 // This is an option used by testing:
109 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
110 cl::init(false), cl::ZeroOrMore,
111 cl::ReallyHidden,
112 cl::desc("Skip Cost Analysis"));
113 // Used to determine if a cold region is worth outlining based on
114 // its inlining cost compared to the original function. Default is set at 10%.
115 // ie. if the cold region reduces the inlining cost of the original function by
116 // at least 10%.
117 static cl::opt<float> MinRegionSizeRatio(
118 "min-region-size-ratio", cl::init(0.1), cl::Hidden,
119 cl::desc("Minimum ratio comparing relative sizes of each "
120 "outline candidate and original function"));
121 // Used to tune the minimum number of execution counts needed in the predecessor
122 // block to the cold edge. ie. confidence interval.
123 static cl::opt<unsigned>
124 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
125 cl::desc("Minimum block executions to consider "
126 "its BranchProbabilityInfo valid"));
127 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
128 // if the branch probability is 10% or less, then it is deemed as 'cold'.
129 static cl::opt<float> ColdBranchRatio(
130 "cold-branch-ratio", cl::init(0.1), cl::Hidden,
131 cl::desc("Minimum BranchProbability to consider a region cold."));
132
133 static cl::opt<unsigned> MaxNumInlineBlocks(
134 "max-num-inline-blocks", cl::init(5), cl::Hidden,
135 cl::desc("Max number of blocks to be partially inlined"));
136
137 // Command line option to set the maximum number of partial inlining allowed
138 // for the module. The default value of -1 means no limit.
139 static cl::opt<int> MaxNumPartialInlining(
140 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
141 cl::desc("Max number of partial inlining. The default is unlimited"));
142
143 // Used only when PGO or user annotated branch data is absent. It is
144 // the least value that is used to weigh the outline region. If BFI
145 // produces larger value, the BFI value will be used.
146 static cl::opt<int>
147 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
148 cl::Hidden, cl::ZeroOrMore,
149 cl::desc("Relative frequency of outline region to "
150 "the entry block"));
151
152 static cl::opt<unsigned> ExtraOutliningPenalty(
153 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
154 cl::desc("A debug option to add additional penalty to the computed one."));
155
156 namespace {
157
158 struct FunctionOutliningInfo {
159 FunctionOutliningInfo() = default;
160
161 // Returns the number of blocks to be inlined including all blocks
162 // in Entries and one return block.
GetNumInlinedBlocks__anond0d623640111::FunctionOutliningInfo163 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
164
165 // A set of blocks including the function entry that guard
166 // the region to be outlined.
167 SmallVector<BasicBlock *, 4> Entries;
168
169 // The return block that is not included in the outlined region.
170 BasicBlock *ReturnBlock = nullptr;
171
172 // The dominating block of the region to be outlined.
173 BasicBlock *NonReturnBlock = nullptr;
174
175 // The set of blocks in Entries that that are predecessors to ReturnBlock
176 SmallVector<BasicBlock *, 4> ReturnBlockPreds;
177 };
178
179 struct FunctionOutliningMultiRegionInfo {
FunctionOutliningMultiRegionInfo__anond0d623640111::FunctionOutliningMultiRegionInfo180 FunctionOutliningMultiRegionInfo()
181 : ORI() {}
182
183 // Container for outline regions
184 struct OutlineRegionInfo {
OutlineRegionInfo__anond0d623640111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo185 OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
186 BasicBlock *EntryBlock, BasicBlock *ExitBlock,
187 BasicBlock *ReturnBlock)
188 : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
189 ReturnBlock(ReturnBlock) {}
190 SmallVector<BasicBlock *, 8> Region;
191 BasicBlock *EntryBlock;
192 BasicBlock *ExitBlock;
193 BasicBlock *ReturnBlock;
194 };
195
196 SmallVector<OutlineRegionInfo, 4> ORI;
197 };
198
199 struct PartialInlinerImpl {
200
PartialInlinerImpl__anond0d623640111::PartialInlinerImpl201 PartialInlinerImpl(
202 std::function<AssumptionCache &(Function &)> *GetAC,
203 std::function<TargetTransformInfo &(Function &)> *GTTI,
204 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
205 ProfileSummaryInfo *ProfSI)
206 : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
207
208 bool run(Module &M);
209 // Main part of the transformation that calls helper functions to find
210 // outlining candidates, clone & outline the function, and attempt to
211 // partially inline the resulting function. Returns true if
212 // inlining was successful, false otherwise. Also returns the outline
213 // function (only if we partially inlined early returns) as there is a
214 // possibility to further "peel" early return statements that were left in the
215 // outline function due to code size.
216 std::pair<bool, Function *> unswitchFunction(Function *F);
217
218 // This class speculatively clones the function to be partial inlined.
219 // At the end of partial inlining, the remaining callsites to the cloned
220 // function that are not partially inlined will be fixed up to reference
221 // the original function, and the cloned function will be erased.
222 struct FunctionCloner {
223 // Two constructors, one for single region outlining, the other for
224 // multi-region outlining.
225 FunctionCloner(Function *F, FunctionOutliningInfo *OI,
226 OptimizationRemarkEmitter &ORE);
227 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
228 OptimizationRemarkEmitter &ORE);
229 ~FunctionCloner();
230
231 // Prepare for function outlining: making sure there is only
232 // one incoming edge from the extracted/outlined region to
233 // the return block.
234 void NormalizeReturnBlock();
235
236 // Do function outlining for cold regions.
237 bool doMultiRegionFunctionOutlining();
238 // Do function outlining for region after early return block(s).
239 // NOTE: For vararg functions that do the vararg handling in the outlined
240 // function, we temporarily generate IR that does not properly
241 // forward varargs to the outlined function. Calling InlineFunction
242 // will update calls to the outlined functions to properly forward
243 // the varargs.
244 Function *doSingleRegionFunctionOutlining();
245
246 Function *OrigFunc = nullptr;
247 Function *ClonedFunc = nullptr;
248
249 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
250 // Keep track of Outlined Functions and the basic block they're called from.
251 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
252
253 // ClonedFunc is inlined in one of its callers after function
254 // outlining.
255 bool IsFunctionInlined = false;
256 // The cost of the region to be outlined.
257 int OutlinedRegionCost = 0;
258 // ClonedOI is specific to outlining non-early return blocks.
259 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
260 // ClonedOMRI is specific to outlining cold regions.
261 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
262 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
263 OptimizationRemarkEmitter &ORE;
264 };
265
266 private:
267 int NumPartialInlining = 0;
268 std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
269 std::function<TargetTransformInfo &(Function &)> *GetTTI;
270 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
271 ProfileSummaryInfo *PSI;
272
273 // Return the frequency of the OutlininingBB relative to F's entry point.
274 // The result is no larger than 1 and is represented using BP.
275 // (Note that the outlined region's 'head' block can only have incoming
276 // edges from the guarding entry blocks).
277 BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
278
279 // Return true if the callee of CS should be partially inlined with
280 // profit.
281 bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
282 BlockFrequency WeightedOutliningRcost,
283 OptimizationRemarkEmitter &ORE);
284
285 // Try to inline DuplicateFunction (cloned from F with call to
286 // the OutlinedFunction into its callers. Return true
287 // if there is any successful inlining.
288 bool tryPartialInline(FunctionCloner &Cloner);
289
290 // Compute the mapping from use site of DuplicationFunction to the enclosing
291 // BB's profile count.
292 void computeCallsiteToProfCountMap(Function *DuplicateFunction,
293 DenseMap<User *, uint64_t> &SiteCountMap);
294
IsLimitReached__anond0d623640111::PartialInlinerImpl295 bool IsLimitReached() {
296 return (MaxNumPartialInlining != -1 &&
297 NumPartialInlining >= MaxNumPartialInlining);
298 }
299
getCallSite__anond0d623640111::PartialInlinerImpl300 static CallSite getCallSite(User *U) {
301 CallSite CS;
302 if (CallInst *CI = dyn_cast<CallInst>(U))
303 CS = CallSite(CI);
304 else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
305 CS = CallSite(II);
306 else
307 llvm_unreachable("All uses must be calls");
308 return CS;
309 }
310
getOneCallSiteTo__anond0d623640111::PartialInlinerImpl311 static CallSite getOneCallSiteTo(Function *F) {
312 User *User = *F->user_begin();
313 return getCallSite(User);
314 }
315
getOneDebugLoc__anond0d623640111::PartialInlinerImpl316 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
317 CallSite CS = getOneCallSiteTo(F);
318 DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
319 BasicBlock *Block = CS.getParent();
320 return std::make_tuple(DLoc, Block);
321 }
322
323 // Returns the costs associated with function outlining:
324 // - The first value is the non-weighted runtime cost for making the call
325 // to the outlined function, including the addtional setup cost in the
326 // outlined function itself;
327 // - The second value is the estimated size of the new call sequence in
328 // basic block Cloner.OutliningCallBB;
329 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
330
331 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
332 // approximate both the size and runtime cost (Note that in the current
333 // inline cost analysis, there is no clear distinction there either).
334 static int computeBBInlineCost(BasicBlock *BB);
335
336 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
337 std::unique_ptr<FunctionOutliningMultiRegionInfo>
338 computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
339 };
340
341 struct PartialInlinerLegacyPass : public ModulePass {
342 static char ID; // Pass identification, replacement for typeid
343
PartialInlinerLegacyPass__anond0d623640111::PartialInlinerLegacyPass344 PartialInlinerLegacyPass() : ModulePass(ID) {
345 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
346 }
347
getAnalysisUsage__anond0d623640111::PartialInlinerLegacyPass348 void getAnalysisUsage(AnalysisUsage &AU) const override {
349 AU.addRequired<AssumptionCacheTracker>();
350 AU.addRequired<ProfileSummaryInfoWrapperPass>();
351 AU.addRequired<TargetTransformInfoWrapperPass>();
352 }
353
runOnModule__anond0d623640111::PartialInlinerLegacyPass354 bool runOnModule(Module &M) override {
355 if (skipModule(M))
356 return false;
357
358 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
359 TargetTransformInfoWrapperPass *TTIWP =
360 &getAnalysis<TargetTransformInfoWrapperPass>();
361 ProfileSummaryInfo *PSI =
362 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
363
364 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
365 [&ACT](Function &F) -> AssumptionCache & {
366 return ACT->getAssumptionCache(F);
367 };
368
369 std::function<TargetTransformInfo &(Function &)> GetTTI =
370 [&TTIWP](Function &F) -> TargetTransformInfo & {
371 return TTIWP->getTTI(F);
372 };
373
374 return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
375 .run(M);
376 }
377 };
378
379 } // end anonymous namespace
380
381 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function * F,OptimizationRemarkEmitter & ORE)382 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
383 OptimizationRemarkEmitter &ORE) {
384 BasicBlock *EntryBlock = &F->front();
385
386 DominatorTree DT(*F);
387 LoopInfo LI(DT);
388 BranchProbabilityInfo BPI(*F, LI);
389 std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
390 BlockFrequencyInfo *BFI;
391 if (!GetBFI) {
392 ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
393 BFI = ScopedBFI.get();
394 } else
395 BFI = &(*GetBFI)(*F);
396
397 // Return if we don't have profiling information.
398 if (!PSI->hasInstrumentationProfile())
399 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
400
401 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
402 llvm::make_unique<FunctionOutliningMultiRegionInfo>();
403
404 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
405 BasicBlock *Dom = BlockList.front();
406 return BlockList.size() > 1 && pred_size(Dom) == 1;
407 };
408
409 auto IsSingleExit =
410 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
411 BasicBlock *ExitBlock = nullptr;
412 for (auto *Block : BlockList) {
413 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
414 if (!is_contained(BlockList, *SI)) {
415 if (ExitBlock) {
416 ORE.emit([&]() {
417 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
418 &SI->front())
419 << "Region dominated by "
420 << ore::NV("Block", BlockList.front()->getName())
421 << " has more than one region exit edge.";
422 });
423 return nullptr;
424 } else
425 ExitBlock = Block;
426 }
427 }
428 }
429 return ExitBlock;
430 };
431
432 auto BBProfileCount = [BFI](BasicBlock *BB) {
433 return BFI->getBlockProfileCount(BB)
434 ? BFI->getBlockProfileCount(BB).getValue()
435 : 0;
436 };
437
438 // Use the same computeBBInlineCost function to compute the cost savings of
439 // the outlining the candidate region.
440 int OverallFunctionCost = 0;
441 for (auto &BB : *F)
442 OverallFunctionCost += computeBBInlineCost(&BB);
443
444 #ifndef NDEBUG
445 if (TracePartialInlining)
446 dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
447 #endif
448 int MinOutlineRegionCost =
449 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
450 BranchProbability MinBranchProbability(
451 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
452 MinBlockCounterExecution);
453 bool ColdCandidateFound = false;
454 BasicBlock *CurrEntry = EntryBlock;
455 std::vector<BasicBlock *> DFS;
456 DenseMap<BasicBlock *, bool> VisitedMap;
457 DFS.push_back(CurrEntry);
458 VisitedMap[CurrEntry] = true;
459 // Use Depth First Search on the basic blocks to find CFG edges that are
460 // considered cold.
461 // Cold regions considered must also have its inline cost compared to the
462 // overall inline cost of the original function. The region is outlined only
463 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
464 // more.
465 while (!DFS.empty()) {
466 auto *thisBB = DFS.back();
467 DFS.pop_back();
468 // Only consider regions with predecessor blocks that are considered
469 // not-cold (default: part of the top 99.99% of all block counters)
470 // AND greater than our minimum block execution count (default: 100).
471 if (PSI->isColdBB(thisBB, BFI) ||
472 BBProfileCount(thisBB) < MinBlockCounterExecution)
473 continue;
474 for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
475 if (VisitedMap[*SI])
476 continue;
477 VisitedMap[*SI] = true;
478 DFS.push_back(*SI);
479 // If branch isn't cold, we skip to the next one.
480 BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
481 if (SuccProb > MinBranchProbability)
482 continue;
483 #ifndef NDEBUG
484 if (TracePartialInlining) {
485 dbgs() << "Found cold edge: " << thisBB->getName() << "->"
486 << (*SI)->getName() << "\nBranch Probability = " << SuccProb
487 << "\n";
488 }
489 #endif
490 SmallVector<BasicBlock *, 8> DominateVector;
491 DT.getDescendants(*SI, DominateVector);
492 // We can only outline single entry regions (for now).
493 if (!IsSingleEntry(DominateVector))
494 continue;
495 BasicBlock *ExitBlock = nullptr;
496 // We can only outline single exit regions (for now).
497 if (!(ExitBlock = IsSingleExit(DominateVector)))
498 continue;
499 int OutlineRegionCost = 0;
500 for (auto *BB : DominateVector)
501 OutlineRegionCost += computeBBInlineCost(BB);
502
503 #ifndef NDEBUG
504 if (TracePartialInlining)
505 dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
506 #endif
507
508 if (OutlineRegionCost < MinOutlineRegionCost) {
509 ORE.emit([&]() {
510 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
511 &SI->front())
512 << ore::NV("Callee", F) << " inline cost-savings smaller than "
513 << ore::NV("Cost", MinOutlineRegionCost);
514 });
515 continue;
516 }
517 // For now, ignore blocks that belong to a SISE region that is a
518 // candidate for outlining. In the future, we may want to look
519 // at inner regions because the outer region may have live-exit
520 // variables.
521 for (auto *BB : DominateVector)
522 VisitedMap[BB] = true;
523 // ReturnBlock here means the block after the outline call
524 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
525 // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
526 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
527 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
528 RegInfo.Region = DominateVector;
529 OutliningInfo->ORI.push_back(RegInfo);
530 #ifndef NDEBUG
531 if (TracePartialInlining) {
532 dbgs() << "Found Cold Candidate starting at block: "
533 << DominateVector.front()->getName() << "\n";
534 }
535 #endif
536 ColdCandidateFound = true;
537 NumColdRegionsFound++;
538 }
539 }
540 if (ColdCandidateFound)
541 return OutliningInfo;
542 else
543 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
544 }
545
546 std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function * F)547 PartialInlinerImpl::computeOutliningInfo(Function *F) {
548 BasicBlock *EntryBlock = &F->front();
549 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
550 if (!BR || BR->isUnconditional())
551 return std::unique_ptr<FunctionOutliningInfo>();
552
553 // Returns true if Succ is BB's successor
554 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
555 return is_contained(successors(BB), Succ);
556 };
557
558 auto IsReturnBlock = [](BasicBlock *BB) {
559 TerminatorInst *TI = BB->getTerminator();
560 return isa<ReturnInst>(TI);
561 };
562
563 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
564 if (IsReturnBlock(Succ1))
565 return std::make_tuple(Succ1, Succ2);
566 if (IsReturnBlock(Succ2))
567 return std::make_tuple(Succ2, Succ1);
568
569 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
570 };
571
572 // Detect a triangular shape:
573 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
574 if (IsSuccessor(Succ1, Succ2))
575 return std::make_tuple(Succ1, Succ2);
576 if (IsSuccessor(Succ2, Succ1))
577 return std::make_tuple(Succ2, Succ1);
578
579 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
580 };
581
582 std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
583 llvm::make_unique<FunctionOutliningInfo>();
584
585 BasicBlock *CurrEntry = EntryBlock;
586 bool CandidateFound = false;
587 do {
588 // The number of blocks to be inlined has already reached
589 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
590 // disables partial inlining for the function.
591 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
592 break;
593
594 if (succ_size(CurrEntry) != 2)
595 break;
596
597 BasicBlock *Succ1 = *succ_begin(CurrEntry);
598 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
599
600 BasicBlock *ReturnBlock, *NonReturnBlock;
601 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
602
603 if (ReturnBlock) {
604 OutliningInfo->Entries.push_back(CurrEntry);
605 OutliningInfo->ReturnBlock = ReturnBlock;
606 OutliningInfo->NonReturnBlock = NonReturnBlock;
607 CandidateFound = true;
608 break;
609 }
610
611 BasicBlock *CommSucc;
612 BasicBlock *OtherSucc;
613 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
614
615 if (!CommSucc)
616 break;
617
618 OutliningInfo->Entries.push_back(CurrEntry);
619 CurrEntry = OtherSucc;
620 } while (true);
621
622 if (!CandidateFound)
623 return std::unique_ptr<FunctionOutliningInfo>();
624
625 // Do sanity check of the entries: threre should not
626 // be any successors (not in the entry set) other than
627 // {ReturnBlock, NonReturnBlock}
628 assert(OutliningInfo->Entries[0] == &F->front() &&
629 "Function Entry must be the first in Entries vector");
630 DenseSet<BasicBlock *> Entries;
631 for (BasicBlock *E : OutliningInfo->Entries)
632 Entries.insert(E);
633
634 // Returns true of BB has Predecessor which is not
635 // in Entries set.
636 auto HasNonEntryPred = [Entries](BasicBlock *BB) {
637 for (auto Pred : predecessors(BB)) {
638 if (!Entries.count(Pred))
639 return true;
640 }
641 return false;
642 };
643 auto CheckAndNormalizeCandidate =
644 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
645 for (BasicBlock *E : OutliningInfo->Entries) {
646 for (auto Succ : successors(E)) {
647 if (Entries.count(Succ))
648 continue;
649 if (Succ == OutliningInfo->ReturnBlock)
650 OutliningInfo->ReturnBlockPreds.push_back(E);
651 else if (Succ != OutliningInfo->NonReturnBlock)
652 return false;
653 }
654 // There should not be any outside incoming edges either:
655 if (HasNonEntryPred(E))
656 return false;
657 }
658 return true;
659 };
660
661 if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
662 return std::unique_ptr<FunctionOutliningInfo>();
663
664 // Now further growing the candidate's inlining region by
665 // peeling off dominating blocks from the outlining region:
666 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
667 BasicBlock *Cand = OutliningInfo->NonReturnBlock;
668 if (succ_size(Cand) != 2)
669 break;
670
671 if (HasNonEntryPred(Cand))
672 break;
673
674 BasicBlock *Succ1 = *succ_begin(Cand);
675 BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
676
677 BasicBlock *ReturnBlock, *NonReturnBlock;
678 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
679 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
680 break;
681
682 if (NonReturnBlock->getSinglePredecessor() != Cand)
683 break;
684
685 // Now grow and update OutlininigInfo:
686 OutliningInfo->Entries.push_back(Cand);
687 OutliningInfo->NonReturnBlock = NonReturnBlock;
688 OutliningInfo->ReturnBlockPreds.push_back(Cand);
689 Entries.insert(Cand);
690 }
691
692 return OutliningInfo;
693 }
694
695 // Check if there is PGO data or user annoated branch data:
hasProfileData(Function * F,FunctionOutliningInfo * OI)696 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
697 if (F->hasProfileData())
698 return true;
699 // Now check if any of the entry block has MD_prof data:
700 for (auto *E : OI->Entries) {
701 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
702 if (!BR || BR->isUnconditional())
703 continue;
704 uint64_t T, F;
705 if (BR->extractProfMetadata(T, F))
706 return true;
707 }
708 return false;
709 }
710
711 BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner)712 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
713 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
714 auto EntryFreq =
715 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
716 auto OutliningCallFreq =
717 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
718 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
719 // we outlined any regions, so we may encounter situations where the
720 // OutliningCallFreq is *slightly* bigger than the EntryFreq.
721 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
722 OutliningCallFreq = EntryFreq;
723 }
724 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
725 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
726
727 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
728 return OutlineRegionRelFreq;
729
730 // When profile data is not available, we need to be conservative in
731 // estimating the overall savings. Static branch prediction can usually
732 // guess the branch direction right (taken/non-taken), but the guessed
733 // branch probability is usually not biased enough. In case when the
734 // outlined region is predicted to be likely, its probability needs
735 // to be made higher (more biased) to not under-estimate the cost of
736 // function outlining. On the other hand, if the outlined region
737 // is predicted to be less likely, the predicted probablity is usually
738 // higher than the actual. For instance, the actual probability of the
739 // less likely target is only 5%, but the guessed probablity can be
740 // 40%. In the latter case, there is no need for further adjustement.
741 // FIXME: add an option for this.
742 if (OutlineRegionRelFreq < BranchProbability(45, 100))
743 return OutlineRegionRelFreq;
744
745 OutlineRegionRelFreq = std::max(
746 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
747
748 return OutlineRegionRelFreq;
749 }
750
shouldPartialInline(CallSite CS,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE)751 bool PartialInlinerImpl::shouldPartialInline(
752 CallSite CS, FunctionCloner &Cloner,
753 BlockFrequency WeightedOutliningRcost,
754 OptimizationRemarkEmitter &ORE) {
755 using namespace ore;
756
757 Instruction *Call = CS.getInstruction();
758 Function *Callee = CS.getCalledFunction();
759 assert(Callee == Cloner.ClonedFunc);
760
761 if (SkipCostAnalysis)
762 return isInlineViable(*Callee);
763
764 Function *Caller = CS.getCaller();
765 auto &CalleeTTI = (*GetTTI)(*Callee);
766 InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
767 *GetAssumptionCache, GetBFI, PSI, &ORE);
768
769 if (IC.isAlways()) {
770 ORE.emit([&]() {
771 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
772 << NV("Callee", Cloner.OrigFunc)
773 << " should always be fully inlined, not partially";
774 });
775 return false;
776 }
777
778 if (IC.isNever()) {
779 ORE.emit([&]() {
780 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
781 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
782 << NV("Caller", Caller)
783 << " because it should never be inlined (cost=never)";
784 });
785 return false;
786 }
787
788 if (!IC) {
789 ORE.emit([&]() {
790 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
791 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
792 << NV("Caller", Caller) << " because too costly to inline (cost="
793 << NV("Cost", IC.getCost()) << ", threshold="
794 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
795 });
796 return false;
797 }
798 const DataLayout &DL = Caller->getParent()->getDataLayout();
799
800 // The savings of eliminating the call:
801 int NonWeightedSavings = getCallsiteCost(CS, DL);
802 BlockFrequency NormWeightedSavings(NonWeightedSavings);
803
804 // Weighted saving is smaller than weighted cost, return false
805 if (NormWeightedSavings < WeightedOutliningRcost) {
806 ORE.emit([&]() {
807 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
808 Call)
809 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
810 << NV("Caller", Caller) << " runtime overhead (overhead="
811 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
812 << ", savings="
813 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
814 << ")"
815 << " of making the outlined call is too high";
816 });
817
818 return false;
819 }
820
821 ORE.emit([&]() {
822 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
823 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
824 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
825 << " (threshold="
826 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
827 });
828 return true;
829 }
830
831 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
832 // For now use a simplified version. The returned 'InlineCost' will be used
833 // to esimate the size cost as well as runtime cost of the BB.
computeBBInlineCost(BasicBlock * BB)834 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
835 int InlineCost = 0;
836 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
837 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
838 if (isa<DbgInfoIntrinsic>(I))
839 continue;
840
841 switch (I->getOpcode()) {
842 case Instruction::BitCast:
843 case Instruction::PtrToInt:
844 case Instruction::IntToPtr:
845 case Instruction::Alloca:
846 continue;
847 case Instruction::GetElementPtr:
848 if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
849 continue;
850 break;
851 default:
852 break;
853 }
854
855 IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
856 if (IntrInst) {
857 if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
858 IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
859 continue;
860 }
861
862 if (CallInst *CI = dyn_cast<CallInst>(I)) {
863 InlineCost += getCallsiteCost(CallSite(CI), DL);
864 continue;
865 }
866
867 if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
868 InlineCost += getCallsiteCost(CallSite(II), DL);
869 continue;
870 }
871
872 if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
873 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
874 continue;
875 }
876 InlineCost += InlineConstants::InstrCost;
877 }
878 return InlineCost;
879 }
880
881 std::tuple<int, int>
computeOutliningCosts(FunctionCloner & Cloner)882 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
883 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
884 for (auto FuncBBPair : Cloner.OutlinedFunctions) {
885 Function *OutlinedFunc = FuncBBPair.first;
886 BasicBlock* OutliningCallBB = FuncBBPair.second;
887 // Now compute the cost of the call sequence to the outlined function
888 // 'OutlinedFunction' in BB 'OutliningCallBB':
889 OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
890
891 // Now compute the cost of the extracted/outlined function itself:
892 for (BasicBlock &BB : *OutlinedFunc)
893 OutlinedFunctionCost += computeBBInlineCost(&BB);
894 }
895 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
896 "Outlined function cost should be no less than the outlined region");
897
898 // The code extractor introduces a new root and exit stub blocks with
899 // additional unconditional branches. Those branches will be eliminated
900 // later with bb layout. The cost should be adjusted accordingly:
901 OutlinedFunctionCost -=
902 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
903
904 int OutliningRuntimeOverhead =
905 OutliningFuncCallCost +
906 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
907 ExtraOutliningPenalty;
908
909 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
910 }
911
912 // Create the callsite to profile count map which is
913 // used to update the original function's entry count,
914 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap)915 void PartialInlinerImpl::computeCallsiteToProfCountMap(
916 Function *DuplicateFunction,
917 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
918 std::vector<User *> Users(DuplicateFunction->user_begin(),
919 DuplicateFunction->user_end());
920 Function *CurrentCaller = nullptr;
921 std::unique_ptr<BlockFrequencyInfo> TempBFI;
922 BlockFrequencyInfo *CurrentCallerBFI = nullptr;
923
924 auto ComputeCurrBFI = [&,this](Function *Caller) {
925 // For the old pass manager:
926 if (!GetBFI) {
927 DominatorTree DT(*Caller);
928 LoopInfo LI(DT);
929 BranchProbabilityInfo BPI(*Caller, LI);
930 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
931 CurrentCallerBFI = TempBFI.get();
932 } else {
933 // New pass manager:
934 CurrentCallerBFI = &(*GetBFI)(*Caller);
935 }
936 };
937
938 for (User *User : Users) {
939 CallSite CS = getCallSite(User);
940 Function *Caller = CS.getCaller();
941 if (CurrentCaller != Caller) {
942 CurrentCaller = Caller;
943 ComputeCurrBFI(Caller);
944 } else {
945 assert(CurrentCallerBFI && "CallerBFI is not set");
946 }
947 BasicBlock *CallBB = CS.getInstruction()->getParent();
948 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
949 if (Count)
950 CallSiteToProfCountMap[User] = *Count;
951 else
952 CallSiteToProfCountMap[User] = 0;
953 }
954 }
955
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE)956 PartialInlinerImpl::FunctionCloner::FunctionCloner(
957 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE)
958 : OrigFunc(F), ORE(ORE) {
959 ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
960
961 // Clone the function, so that we can hack away on it.
962 ValueToValueMapTy VMap;
963 ClonedFunc = CloneFunction(F, VMap);
964
965 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
966 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
967 for (BasicBlock *BB : OI->Entries) {
968 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
969 }
970 for (BasicBlock *E : OI->ReturnBlockPreds) {
971 BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
972 ClonedOI->ReturnBlockPreds.push_back(NewE);
973 }
974 // Go ahead and update all uses to the duplicate, so that we can just
975 // use the inliner functionality when we're done hacking.
976 F->replaceAllUsesWith(ClonedFunc);
977 }
978
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE)979 PartialInlinerImpl::FunctionCloner::FunctionCloner(
980 Function *F, FunctionOutliningMultiRegionInfo *OI,
981 OptimizationRemarkEmitter &ORE)
982 : OrigFunc(F), ORE(ORE) {
983 ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();
984
985 // Clone the function, so that we can hack away on it.
986 ValueToValueMapTy VMap;
987 ClonedFunc = CloneFunction(F, VMap);
988
989 // Go through all Outline Candidate Regions and update all BasicBlock
990 // information.
991 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
992 OI->ORI) {
993 SmallVector<BasicBlock *, 8> Region;
994 for (BasicBlock *BB : RegionInfo.Region) {
995 Region.push_back(cast<BasicBlock>(VMap[BB]));
996 }
997 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
998 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
999 BasicBlock *NewReturnBlock = nullptr;
1000 if (RegionInfo.ReturnBlock)
1001 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1002 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1003 Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1004 ClonedOMRI->ORI.push_back(MappedRegionInfo);
1005 }
1006 // Go ahead and update all uses to the duplicate, so that we can just
1007 // use the inliner functionality when we're done hacking.
1008 F->replaceAllUsesWith(ClonedFunc);
1009 }
1010
NormalizeReturnBlock()1011 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1012 auto getFirstPHI = [](BasicBlock *BB) {
1013 BasicBlock::iterator I = BB->begin();
1014 PHINode *FirstPhi = nullptr;
1015 while (I != BB->end()) {
1016 PHINode *Phi = dyn_cast<PHINode>(I);
1017 if (!Phi)
1018 break;
1019 if (!FirstPhi) {
1020 FirstPhi = Phi;
1021 break;
1022 }
1023 }
1024 return FirstPhi;
1025 };
1026
1027 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1028 // blocks.
1029 if (!ClonedOI)
1030 return;
1031
1032 // Special hackery is needed with PHI nodes that have inputs from more than
1033 // one extracted block. For simplicity, just split the PHIs into a two-level
1034 // sequence of PHIs, some of which will go in the extracted region, and some
1035 // of which will go outside.
1036 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1037 // only split block when necessary:
1038 PHINode *FirstPhi = getFirstPHI(PreReturn);
1039 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1040
1041 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1042 return;
1043
1044 auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1045 Value *CommonValue = PN->getIncomingValue(0);
1046 if (all_of(PN->incoming_values(),
1047 [&](Value *V) { return V == CommonValue; }))
1048 return CommonValue;
1049 return nullptr;
1050 };
1051
1052 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1053 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1054 BasicBlock::iterator I = PreReturn->begin();
1055 Instruction *Ins = &ClonedOI->ReturnBlock->front();
1056 SmallVector<Instruction *, 4> DeadPhis;
1057 while (I != PreReturn->end()) {
1058 PHINode *OldPhi = dyn_cast<PHINode>(I);
1059 if (!OldPhi)
1060 break;
1061
1062 PHINode *RetPhi =
1063 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1064 OldPhi->replaceAllUsesWith(RetPhi);
1065 Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1066
1067 RetPhi->addIncoming(&*I, PreReturn);
1068 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1069 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1070 OldPhi->removeIncomingValue(E);
1071 }
1072
1073 // After incoming values splitting, the old phi may become trivial.
1074 // Keeping the trivial phi can introduce definition inside the outline
1075 // region which is live-out, causing necessary overhead (load, store
1076 // arg passing etc).
1077 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1078 OldPhi->replaceAllUsesWith(OldPhiVal);
1079 DeadPhis.push_back(OldPhi);
1080 }
1081 ++I;
1082 }
1083 for (auto *DP : DeadPhis)
1084 DP->eraseFromParent();
1085
1086 for (auto E : ClonedOI->ReturnBlockPreds) {
1087 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1088 }
1089 }
1090
doMultiRegionFunctionOutlining()1091 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1092
1093 auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1094 int Cost = 0;
1095 for (BasicBlock* BB : Region)
1096 Cost += computeBBInlineCost(BB);
1097 return Cost;
1098 };
1099
1100 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1101
1102 if (ClonedOMRI->ORI.empty())
1103 return false;
1104
1105 // The CodeExtractor needs a dominator tree.
1106 DominatorTree DT;
1107 DT.recalculate(*ClonedFunc);
1108
1109 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1110 LoopInfo LI(DT);
1111 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1112 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1113
1114 SetVector<Value *> Inputs, Outputs, Sinks;
1115 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1116 ClonedOMRI->ORI) {
1117 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1118
1119 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1120 ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false);
1121
1122 CE.findInputsOutputs(Inputs, Outputs, Sinks);
1123
1124 #ifndef NDEBUG
1125 if (TracePartialInlining) {
1126 dbgs() << "inputs: " << Inputs.size() << "\n";
1127 dbgs() << "outputs: " << Outputs.size() << "\n";
1128 for (Value *value : Inputs)
1129 dbgs() << "value used in func: " << *value << "\n";
1130 for (Value *output : Outputs)
1131 dbgs() << "instr used in func: " << *output << "\n";
1132 }
1133 #endif
1134 // Do not extract regions that have live exit variables.
1135 if (Outputs.size() > 0 && !ForceLiveExit)
1136 continue;
1137
1138 Function *OutlinedFunc = CE.extractCodeRegion();
1139
1140 if (OutlinedFunc) {
1141 CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1142 BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
1143 assert(OutliningCallBB->getParent() == ClonedFunc);
1144 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1145 NumColdRegionsOutlined++;
1146 OutlinedRegionCost += CurrentOutlinedRegionCost;
1147
1148 if (MarkOutlinedColdCC) {
1149 OutlinedFunc->setCallingConv(CallingConv::Cold);
1150 OCS.setCallingConv(CallingConv::Cold);
1151 }
1152 } else
1153 ORE.emit([&]() {
1154 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1155 &RegionInfo.Region.front()->front())
1156 << "Failed to extract region at block "
1157 << ore::NV("Block", RegionInfo.Region.front());
1158 });
1159 }
1160
1161 return !OutlinedFunctions.empty();
1162 }
1163
1164 Function *
doSingleRegionFunctionOutlining()1165 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1166 // Returns true if the block is to be partial inlined into the caller
1167 // (i.e. not to be extracted to the out of line function)
1168 auto ToBeInlined = [&, this](BasicBlock *BB) {
1169 return BB == ClonedOI->ReturnBlock ||
1170 (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1171 ClonedOI->Entries.end());
1172 };
1173
1174 assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1175 // The CodeExtractor needs a dominator tree.
1176 DominatorTree DT;
1177 DT.recalculate(*ClonedFunc);
1178
1179 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1180 LoopInfo LI(DT);
1181 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1182 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1183
1184 // Gather up the blocks that we're going to extract.
1185 std::vector<BasicBlock *> ToExtract;
1186 ToExtract.push_back(ClonedOI->NonReturnBlock);
1187 OutlinedRegionCost +=
1188 PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1189 for (BasicBlock &BB : *ClonedFunc)
1190 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1191 ToExtract.push_back(&BB);
1192 // FIXME: the code extractor may hoist/sink more code
1193 // into the outlined function which may make the outlining
1194 // overhead (the difference of the outlined function cost
1195 // and OutliningRegionCost) look larger.
1196 OutlinedRegionCost += computeBBInlineCost(&BB);
1197 }
1198
1199 // Extract the body of the if.
1200 Function *OutlinedFunc =
1201 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1202 ClonedFuncBFI.get(), &BPI,
1203 /* AllowVarargs */ true)
1204 .extractCodeRegion();
1205
1206 if (OutlinedFunc) {
1207 BasicBlock *OutliningCallBB =
1208 PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1209 .getInstruction()
1210 ->getParent();
1211 assert(OutliningCallBB->getParent() == ClonedFunc);
1212 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1213 } else
1214 ORE.emit([&]() {
1215 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1216 &ToExtract.front()->front())
1217 << "Failed to extract region at block "
1218 << ore::NV("Block", ToExtract.front());
1219 });
1220
1221 return OutlinedFunc;
1222 }
1223
~FunctionCloner()1224 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1225 // Ditch the duplicate, since we're done with it, and rewrite all remaining
1226 // users (function pointers, etc.) back to the original function.
1227 ClonedFunc->replaceAllUsesWith(OrigFunc);
1228 ClonedFunc->eraseFromParent();
1229 if (!IsFunctionInlined) {
1230 // Remove each function that was speculatively created if there is no
1231 // reference.
1232 for (auto FuncBBPair : OutlinedFunctions) {
1233 Function *Func = FuncBBPair.first;
1234 Func->eraseFromParent();
1235 }
1236 }
1237 }
1238
unswitchFunction(Function * F)1239 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1240
1241 if (F->hasAddressTaken())
1242 return {false, nullptr};
1243
1244 // Let inliner handle it
1245 if (F->hasFnAttribute(Attribute::AlwaysInline))
1246 return {false, nullptr};
1247
1248 if (F->hasFnAttribute(Attribute::NoInline))
1249 return {false, nullptr};
1250
1251 if (PSI->isFunctionEntryCold(F))
1252 return {false, nullptr};
1253
1254 if (F->user_begin() == F->user_end())
1255 return {false, nullptr};
1256
1257 OptimizationRemarkEmitter ORE(F);
1258
1259 // Only try to outline cold regions if we have a profile summary, which
1260 // implies we have profiling information.
1261 if (PSI->hasProfileSummary() && F->hasProfileData() &&
1262 !DisableMultiRegionPartialInline) {
1263 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1264 computeOutliningColdRegionsInfo(F, ORE);
1265 if (OMRI) {
1266 FunctionCloner Cloner(F, OMRI.get(), ORE);
1267
1268 #ifndef NDEBUG
1269 if (TracePartialInlining) {
1270 dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
1271 dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
1272 << "\n";
1273 }
1274 #endif
1275 bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1276
1277 if (DidOutline) {
1278 #ifndef NDEBUG
1279 if (TracePartialInlining) {
1280 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1281 Cloner.ClonedFunc->print(dbgs());
1282 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1283 }
1284 #endif
1285
1286 if (tryPartialInline(Cloner))
1287 return {true, nullptr};
1288 }
1289 }
1290 }
1291
1292 // Fall-thru to regular partial inlining if we:
1293 // i) can't find any cold regions to outline, or
1294 // ii) can't inline the outlined function anywhere.
1295 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1296 if (!OI)
1297 return {false, nullptr};
1298
1299 FunctionCloner Cloner(F, OI.get(), ORE);
1300 Cloner.NormalizeReturnBlock();
1301
1302 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1303
1304 if (!OutlinedFunction)
1305 return {false, nullptr};
1306
1307 bool AnyInline = tryPartialInline(Cloner);
1308
1309 if (AnyInline)
1310 return {true, OutlinedFunction};
1311
1312 return {false, nullptr};
1313 }
1314
tryPartialInline(FunctionCloner & Cloner)1315 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1316 if (Cloner.OutlinedFunctions.empty())
1317 return false;
1318
1319 int SizeCost = 0;
1320 BlockFrequency WeightedRcost;
1321 int NonWeightedRcost;
1322 std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1323
1324 // Only calculate RelativeToEntryFreq when we are doing single region
1325 // outlining.
1326 BranchProbability RelativeToEntryFreq;
1327 if (Cloner.ClonedOI) {
1328 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1329 } else
1330 // RelativeToEntryFreq doesn't make sense when we have more than one
1331 // outlined call because each call will have a different relative frequency
1332 // to the entry block. We can consider using the average, but the
1333 // usefulness of that information is questionable. For now, assume we never
1334 // execute the calls to outlined functions.
1335 RelativeToEntryFreq = BranchProbability(0, 1);
1336
1337 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1338
1339 // The call sequence(s) to the outlined function(s) are larger than the sum of
1340 // the original outlined region size(s), it does not increase the chances of
1341 // inlining the function with outlining (The inliner uses the size increase to
1342 // model the cost of inlining a callee).
1343 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1344 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1345 DebugLoc DLoc;
1346 BasicBlock *Block;
1347 std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1348 OrigFuncORE.emit([&]() {
1349 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1350 DLoc, Block)
1351 << ore::NV("Function", Cloner.OrigFunc)
1352 << " not partially inlined into callers (Original Size = "
1353 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1354 << ", Size of call sequence to outlined function = "
1355 << ore::NV("NewSize", SizeCost) << ")";
1356 });
1357 return false;
1358 }
1359
1360 assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
1361 "F's users should all be replaced!");
1362
1363 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1364 Cloner.ClonedFunc->user_end());
1365
1366 DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1367 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1368 if (CalleeEntryCount)
1369 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1370
1371 uint64_t CalleeEntryCountV =
1372 (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1373
1374 bool AnyInline = false;
1375 for (User *User : Users) {
1376 CallSite CS = getCallSite(User);
1377
1378 if (IsLimitReached())
1379 continue;
1380
1381 OptimizationRemarkEmitter CallerORE(CS.getCaller());
1382 if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
1383 continue;
1384
1385 // Construct remark before doing the inlining, as after successful inlining
1386 // the callsite is removed.
1387 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
1388 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1389 << ore::NV("Caller", CS.getCaller());
1390
1391 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
1392 // We can only forward varargs when we outlined a single region, else we
1393 // bail on vararg functions.
1394 if (!InlineFunction(CS, IFI, nullptr, true,
1395 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1396 : nullptr)))
1397 continue;
1398
1399 CallerORE.emit(OR);
1400
1401 // Now update the entry count:
1402 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1403 uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1404 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1405 }
1406
1407 AnyInline = true;
1408 NumPartialInlining++;
1409 // Update the stats
1410 if (Cloner.ClonedOI)
1411 NumPartialInlined++;
1412 else
1413 NumColdOutlinePartialInlined++;
1414
1415 }
1416
1417 if (AnyInline) {
1418 Cloner.IsFunctionInlined = true;
1419 if (CalleeEntryCount)
1420 Cloner.OrigFunc->setEntryCount(
1421 CalleeEntryCount.setCount(CalleeEntryCountV));
1422 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1423 OrigFuncORE.emit([&]() {
1424 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1425 << "Partially inlined into at least one caller";
1426 });
1427
1428 }
1429
1430 return AnyInline;
1431 }
1432
run(Module & M)1433 bool PartialInlinerImpl::run(Module &M) {
1434 if (DisablePartialInlining)
1435 return false;
1436
1437 std::vector<Function *> Worklist;
1438 Worklist.reserve(M.size());
1439 for (Function &F : M)
1440 if (!F.use_empty() && !F.isDeclaration())
1441 Worklist.push_back(&F);
1442
1443 bool Changed = false;
1444 while (!Worklist.empty()) {
1445 Function *CurrFunc = Worklist.back();
1446 Worklist.pop_back();
1447
1448 if (CurrFunc->use_empty())
1449 continue;
1450
1451 bool Recursive = false;
1452 for (User *U : CurrFunc->users())
1453 if (Instruction *I = dyn_cast<Instruction>(U))
1454 if (I->getParent()->getParent() == CurrFunc) {
1455 Recursive = true;
1456 break;
1457 }
1458 if (Recursive)
1459 continue;
1460
1461 std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1462 if (Result.second)
1463 Worklist.push_back(Result.second);
1464 if (Result.first) {
1465 Changed = true;
1466 }
1467 }
1468
1469 return Changed;
1470 }
1471
// Static ID member of the legacy pass, initialized to 0.
char PartialInlinerLegacyPass::ID = 0;

// Pass initialization boilerplate for PartialInlinerLegacyPass, given the
// argument string "partial-inliner" and the description "Partial Inliner".
// The passes named in the INITIALIZE_PASS_DEPENDENCY lines are declared as
// dependencies.
// NOTE(review): the two trailing 'false' arguments are presumably the
// CFG-only / is-analysis flags of the macro -- confirm against PassSupport.h.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

// Factory for the legacy pass: returns a newly allocated
// PartialInlinerLegacyPass.
// NOTE(review): ownership of the returned pointer presumably passes to the
// caller / pass manager -- confirm.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1485
run(Module & M,ModuleAnalysisManager & AM)1486 PreservedAnalyses PartialInlinerPass::run(Module &M,
1487 ModuleAnalysisManager &AM) {
1488 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1489
1490 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
1491 [&FAM](Function &F) -> AssumptionCache & {
1492 return FAM.getResult<AssumptionAnalysis>(F);
1493 };
1494
1495 std::function<BlockFrequencyInfo &(Function &)> GetBFI =
1496 [&FAM](Function &F) -> BlockFrequencyInfo & {
1497 return FAM.getResult<BlockFrequencyAnalysis>(F);
1498 };
1499
1500 std::function<TargetTransformInfo &(Function &)> GetTTI =
1501 [&FAM](Function &F) -> TargetTransformInfo & {
1502 return FAM.getResult<TargetIRAnalysis>(F);
1503 };
1504
1505 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1506
1507 if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI)
1508 .run(M))
1509 return PreservedAnalyses::none();
1510 return PreservedAnalyses::all();
1511 }
1512