1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass performs partial inlining, typically by inlining an if statement
11 // that surrounds the body of the function.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Transforms/IPO/PartialInlining.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/ADT/DenseSet.h"
18 #include "llvm/ADT/None.h"
19 #include "llvm/ADT/Optional.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/BlockFrequencyInfo.h"
24 #include "llvm/Analysis/BranchProbabilityInfo.h"
25 #include "llvm/Analysis/InlineCost.h"
26 #include "llvm/Analysis/LoopInfo.h"
27 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
28 #include "llvm/Analysis/ProfileSummaryInfo.h"
29 #include "llvm/Analysis/TargetLibraryInfo.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/IR/Attributes.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/CFG.h"
34 #include "llvm/IR/CallSite.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/IR/Dominators.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/InstrTypes.h"
40 #include "llvm/IR/Instruction.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/IntrinsicInst.h"
43 #include "llvm/IR/Intrinsics.h"
44 #include "llvm/IR/Module.h"
45 #include "llvm/IR/User.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/BlockFrequency.h"
48 #include "llvm/Support/BranchProbability.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Transforms/IPO.h"
53 #include "llvm/Transforms/Utils/Cloning.h"
54 #include "llvm/Transforms/Utils/CodeExtractor.h"
55 #include "llvm/Transforms/Utils/ValueMapper.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <cstdint>
59 #include <functional>
60 #include <iterator>
61 #include <memory>
62 #include <tuple>
63 #include <vector>
64 
65 using namespace llvm;
66 
67 #define DEBUG_TYPE "partial-inlining"
68 
69 STATISTIC(NumPartialInlined,
70           "Number of callsites functions partially inlined into.");
71 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
72                                         "cold outlined regions were partially "
73                                         "inlined into its caller(s).");
74 STATISTIC(NumColdRegionsFound,
75            "Number of cold single entry/exit regions found.");
76 STATISTIC(NumColdRegionsOutlined,
77            "Number of cold single entry/exit regions outlined.");
78 
79 // Command line option to disable partial-inlining. The default is false:
80 static cl::opt<bool>
81     DisablePartialInlining("disable-partial-inlining", cl::init(false),
82                            cl::Hidden, cl::desc("Disable partial inlining"));
83 // Command line option to disable multi-region partial-inlining. The default is
84 // false:
85 static cl::opt<bool> DisableMultiRegionPartialInline(
86     "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
87     cl::desc("Disable multi-region partial inlining"));
88 
89 // Command line option to force outlining in regions with live exit variables.
90 // The default is false:
91 static cl::opt<bool>
92     ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
93                cl::desc("Force outline regions with live exits"));
94 
95 // Command line option to enable marking outline functions with Cold Calling
96 // Convention. The default is false:
97 static cl::opt<bool>
98     MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
99                        cl::desc("Mark outline function calls with ColdCC"));
100 
101 #ifndef NDEBUG
102 // Command line option to debug partial-inlining. The default is none:
103 static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
104                                           cl::init(false), cl::Hidden,
105                                           cl::desc("Trace partial inlining."));
106 #endif
107 
108 // This is an option used by testing:
109 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
110                                       cl::init(false), cl::ZeroOrMore,
111                                       cl::ReallyHidden,
112                                       cl::desc("Skip Cost Analysis"));
113 // Used to determine if a cold region is worth outlining based on
114 // its inlining cost compared to the original function.  Default is set at 10%.
115 // ie. if the cold region reduces the inlining cost of the original function by
116 // at least 10%.
117 static cl::opt<float> MinRegionSizeRatio(
118     "min-region-size-ratio", cl::init(0.1), cl::Hidden,
119     cl::desc("Minimum ratio comparing relative sizes of each "
120              "outline candidate and original function"));
121 // Used to tune the minimum number of execution counts needed in the predecessor
122 // block to the cold edge. ie. confidence interval.
123 static cl::opt<unsigned>
124     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
125                              cl::desc("Minimum block executions to consider "
126                                       "its BranchProbabilityInfo valid"));
127 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
128 // if the branch probability is 10% or less, then it is deemed as 'cold'.
129 static cl::opt<float> ColdBranchRatio(
130     "cold-branch-ratio", cl::init(0.1), cl::Hidden,
131     cl::desc("Minimum BranchProbability to consider a region cold."));
132 
133 static cl::opt<unsigned> MaxNumInlineBlocks(
134     "max-num-inline-blocks", cl::init(5), cl::Hidden,
135     cl::desc("Max number of blocks to be partially inlined"));
136 
137 // Command line option to set the maximum number of partial inlining allowed
138 // for the module. The default value of -1 means no limit.
139 static cl::opt<int> MaxNumPartialInlining(
140     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
141     cl::desc("Max number of partial inlining. The default is unlimited"));
142 
143 // Used only when PGO or user annotated branch data is absent. It is
144 // the least value that is used to weigh the outline region. If BFI
145 // produces larger value, the BFI value will be used.
146 static cl::opt<int>
147     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
148                              cl::Hidden, cl::ZeroOrMore,
149                              cl::desc("Relative frequency of outline region to "
150                                       "the entry block"));
151 
152 static cl::opt<unsigned> ExtraOutliningPenalty(
153     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
154     cl::desc("A debug option to add additional penalty to the computed one."));
155 
156 namespace {
157 
158 struct FunctionOutliningInfo {
159   FunctionOutliningInfo() = default;
160 
161   // Returns the number of blocks to be inlined including all blocks
162   // in Entries and one return block.
GetNumInlinedBlocks__anond0d623640111::FunctionOutliningInfo163   unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
164 
165   // A set of blocks including the function entry that guard
166   // the region to be outlined.
167   SmallVector<BasicBlock *, 4> Entries;
168 
169   // The return block that is not included in the outlined region.
170   BasicBlock *ReturnBlock = nullptr;
171 
172   // The dominating block of the region to be outlined.
173   BasicBlock *NonReturnBlock = nullptr;
174 
175   // The set of blocks in Entries that that are predecessors to ReturnBlock
176   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
177 };
178 
179 struct FunctionOutliningMultiRegionInfo {
FunctionOutliningMultiRegionInfo__anond0d623640111::FunctionOutliningMultiRegionInfo180   FunctionOutliningMultiRegionInfo()
181       : ORI() {}
182 
183   // Container for outline regions
184   struct OutlineRegionInfo {
OutlineRegionInfo__anond0d623640111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo185     OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
186                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
187                       BasicBlock *ReturnBlock)
188         : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
189           ReturnBlock(ReturnBlock) {}
190     SmallVector<BasicBlock *, 8> Region;
191     BasicBlock *EntryBlock;
192     BasicBlock *ExitBlock;
193     BasicBlock *ReturnBlock;
194   };
195 
196   SmallVector<OutlineRegionInfo, 4> ORI;
197 };
198 
199 struct PartialInlinerImpl {
200 
PartialInlinerImpl__anond0d623640111::PartialInlinerImpl201   PartialInlinerImpl(
202       std::function<AssumptionCache &(Function &)> *GetAC,
203       std::function<TargetTransformInfo &(Function &)> *GTTI,
204       Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
205       ProfileSummaryInfo *ProfSI)
206       : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
207 
208   bool run(Module &M);
209   // Main part of the transformation that calls helper functions to find
210   // outlining candidates, clone & outline the function, and attempt to
211   // partially inline the resulting function. Returns true if
212   // inlining was successful, false otherwise.  Also returns the outline
213   // function (only if we partially inlined early returns) as there is a
214   // possibility to further "peel" early return statements that were left in the
215   // outline function due to code size.
216   std::pair<bool, Function *> unswitchFunction(Function *F);
217 
218   // This class speculatively clones the function to be partial inlined.
219   // At the end of partial inlining, the remaining callsites to the cloned
220   // function that are not partially inlined will be fixed up to reference
221   // the original function, and the cloned function will be erased.
222   struct FunctionCloner {
223     // Two constructors, one for single region outlining, the other for
224     // multi-region outlining.
225     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
226                    OptimizationRemarkEmitter &ORE);
227     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
228                    OptimizationRemarkEmitter &ORE);
229     ~FunctionCloner();
230 
231     // Prepare for function outlining: making sure there is only
232     // one incoming edge from the extracted/outlined region to
233     // the return block.
234     void NormalizeReturnBlock();
235 
236     // Do function outlining for cold regions.
237     bool doMultiRegionFunctionOutlining();
238     // Do function outlining for region after early return block(s).
239     // NOTE: For vararg functions that do the vararg handling in the outlined
240     //       function, we temporarily generate IR that does not properly
241     //       forward varargs to the outlined function. Calling InlineFunction
242     //       will update calls to the outlined functions to properly forward
243     //       the varargs.
244     Function *doSingleRegionFunctionOutlining();
245 
246     Function *OrigFunc = nullptr;
247     Function *ClonedFunc = nullptr;
248 
249     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
250     // Keep track of Outlined Functions and the basic block they're called from.
251     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
252 
253     // ClonedFunc is inlined in one of its callers after function
254     // outlining.
255     bool IsFunctionInlined = false;
256     // The cost of the region to be outlined.
257     int OutlinedRegionCost = 0;
258     // ClonedOI is specific to outlining non-early return blocks.
259     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
260     // ClonedOMRI is specific to outlining cold regions.
261     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
262     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
263     OptimizationRemarkEmitter &ORE;
264   };
265 
266 private:
267   int NumPartialInlining = 0;
268   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
269   std::function<TargetTransformInfo &(Function &)> *GetTTI;
270   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
271   ProfileSummaryInfo *PSI;
272 
273   // Return the frequency of the OutlininingBB relative to F's entry point.
274   // The result is no larger than 1 and is represented using BP.
275   // (Note that the outlined region's 'head' block can only have incoming
276   // edges from the guarding entry blocks).
277   BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
278 
279   // Return true if the callee of CS should be partially inlined with
280   // profit.
281   bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
282                            BlockFrequency WeightedOutliningRcost,
283                            OptimizationRemarkEmitter &ORE);
284 
285   // Try to inline DuplicateFunction (cloned from F with call to
286   // the OutlinedFunction into its callers. Return true
287   // if there is any successful inlining.
288   bool tryPartialInline(FunctionCloner &Cloner);
289 
290   // Compute the mapping from use site of DuplicationFunction to the enclosing
291   // BB's profile count.
292   void computeCallsiteToProfCountMap(Function *DuplicateFunction,
293                                      DenseMap<User *, uint64_t> &SiteCountMap);
294 
IsLimitReached__anond0d623640111::PartialInlinerImpl295   bool IsLimitReached() {
296     return (MaxNumPartialInlining != -1 &&
297             NumPartialInlining >= MaxNumPartialInlining);
298   }
299 
getCallSite__anond0d623640111::PartialInlinerImpl300   static CallSite getCallSite(User *U) {
301     CallSite CS;
302     if (CallInst *CI = dyn_cast<CallInst>(U))
303       CS = CallSite(CI);
304     else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
305       CS = CallSite(II);
306     else
307       llvm_unreachable("All uses must be calls");
308     return CS;
309   }
310 
getOneCallSiteTo__anond0d623640111::PartialInlinerImpl311   static CallSite getOneCallSiteTo(Function *F) {
312     User *User = *F->user_begin();
313     return getCallSite(User);
314   }
315 
getOneDebugLoc__anond0d623640111::PartialInlinerImpl316   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
317     CallSite CS = getOneCallSiteTo(F);
318     DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
319     BasicBlock *Block = CS.getParent();
320     return std::make_tuple(DLoc, Block);
321   }
322 
323   // Returns the costs associated with function outlining:
324   // - The first value is the non-weighted runtime cost for making the call
325   //   to the outlined function, including the addtional  setup cost in the
326   //    outlined function itself;
327   // - The second value is the estimated size of the new call sequence in
328   //   basic block Cloner.OutliningCallBB;
329   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
330 
331   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
332   // approximate both the size and runtime cost (Note that in the current
333   // inline cost analysis, there is no clear distinction there either).
334   static int computeBBInlineCost(BasicBlock *BB);
335 
336   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
337   std::unique_ptr<FunctionOutliningMultiRegionInfo>
338   computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
339 };
340 
341 struct PartialInlinerLegacyPass : public ModulePass {
342   static char ID; // Pass identification, replacement for typeid
343 
PartialInlinerLegacyPass__anond0d623640111::PartialInlinerLegacyPass344   PartialInlinerLegacyPass() : ModulePass(ID) {
345     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
346   }
347 
getAnalysisUsage__anond0d623640111::PartialInlinerLegacyPass348   void getAnalysisUsage(AnalysisUsage &AU) const override {
349     AU.addRequired<AssumptionCacheTracker>();
350     AU.addRequired<ProfileSummaryInfoWrapperPass>();
351     AU.addRequired<TargetTransformInfoWrapperPass>();
352   }
353 
runOnModule__anond0d623640111::PartialInlinerLegacyPass354   bool runOnModule(Module &M) override {
355     if (skipModule(M))
356       return false;
357 
358     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
359     TargetTransformInfoWrapperPass *TTIWP =
360         &getAnalysis<TargetTransformInfoWrapperPass>();
361     ProfileSummaryInfo *PSI =
362         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
363 
364     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
365         [&ACT](Function &F) -> AssumptionCache & {
366       return ACT->getAssumptionCache(F);
367     };
368 
369     std::function<TargetTransformInfo &(Function &)> GetTTI =
370         [&TTIWP](Function &F) -> TargetTransformInfo & {
371       return TTIWP->getTTI(F);
372     };
373 
374     return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
375         .run(M);
376   }
377 };
378 
379 } // end anonymous namespace
380 
381 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function * F,OptimizationRemarkEmitter & ORE)382 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
383                                                     OptimizationRemarkEmitter &ORE) {
384   BasicBlock *EntryBlock = &F->front();
385 
386   DominatorTree DT(*F);
387   LoopInfo LI(DT);
388   BranchProbabilityInfo BPI(*F, LI);
389   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
390   BlockFrequencyInfo *BFI;
391   if (!GetBFI) {
392     ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
393     BFI = ScopedBFI.get();
394   } else
395     BFI = &(*GetBFI)(*F);
396 
397   // Return if we don't have profiling information.
398   if (!PSI->hasInstrumentationProfile())
399     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
400 
401   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
402       llvm::make_unique<FunctionOutliningMultiRegionInfo>();
403 
404   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
405     BasicBlock *Dom = BlockList.front();
406     return BlockList.size() > 1 && pred_size(Dom) == 1;
407   };
408 
409   auto IsSingleExit =
410       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
411     BasicBlock *ExitBlock = nullptr;
412     for (auto *Block : BlockList) {
413       for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
414         if (!is_contained(BlockList, *SI)) {
415           if (ExitBlock) {
416             ORE.emit([&]() {
417               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
418                                               &SI->front())
419                      << "Region dominated by "
420                      << ore::NV("Block", BlockList.front()->getName())
421                      << " has more than one region exit edge.";
422             });
423             return nullptr;
424           } else
425             ExitBlock = Block;
426         }
427       }
428     }
429     return ExitBlock;
430   };
431 
432   auto BBProfileCount = [BFI](BasicBlock *BB) {
433     return BFI->getBlockProfileCount(BB)
434                ? BFI->getBlockProfileCount(BB).getValue()
435                : 0;
436   };
437 
438   // Use the same computeBBInlineCost function to compute the cost savings of
439   // the outlining the candidate region.
440   int OverallFunctionCost = 0;
441   for (auto &BB : *F)
442     OverallFunctionCost += computeBBInlineCost(&BB);
443 
444 #ifndef NDEBUG
445   if (TracePartialInlining)
446     dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
447 #endif
448   int MinOutlineRegionCost =
449       static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
450   BranchProbability MinBranchProbability(
451       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
452       MinBlockCounterExecution);
453   bool ColdCandidateFound = false;
454   BasicBlock *CurrEntry = EntryBlock;
455   std::vector<BasicBlock *> DFS;
456   DenseMap<BasicBlock *, bool> VisitedMap;
457   DFS.push_back(CurrEntry);
458   VisitedMap[CurrEntry] = true;
459   // Use Depth First Search on the basic blocks to find CFG edges that are
460   // considered cold.
461   // Cold regions considered must also have its inline cost compared to the
462   // overall inline cost of the original function.  The region is outlined only
463   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
464   // more.
465   while (!DFS.empty()) {
466     auto *thisBB = DFS.back();
467     DFS.pop_back();
468     // Only consider regions with predecessor blocks that are considered
469     // not-cold (default: part of the top 99.99% of all block counters)
470     // AND greater than our minimum block execution count (default: 100).
471     if (PSI->isColdBB(thisBB, BFI) ||
472         BBProfileCount(thisBB) < MinBlockCounterExecution)
473       continue;
474     for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
475       if (VisitedMap[*SI])
476         continue;
477       VisitedMap[*SI] = true;
478       DFS.push_back(*SI);
479       // If branch isn't cold, we skip to the next one.
480       BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
481       if (SuccProb > MinBranchProbability)
482         continue;
483 #ifndef NDEBUG
484       if (TracePartialInlining) {
485         dbgs() << "Found cold edge: " << thisBB->getName() << "->"
486                << (*SI)->getName() << "\nBranch Probability = " << SuccProb
487                << "\n";
488       }
489 #endif
490       SmallVector<BasicBlock *, 8> DominateVector;
491       DT.getDescendants(*SI, DominateVector);
492       // We can only outline single entry regions (for now).
493       if (!IsSingleEntry(DominateVector))
494         continue;
495       BasicBlock *ExitBlock = nullptr;
496       // We can only outline single exit regions (for now).
497       if (!(ExitBlock = IsSingleExit(DominateVector)))
498         continue;
499       int OutlineRegionCost = 0;
500       for (auto *BB : DominateVector)
501         OutlineRegionCost += computeBBInlineCost(BB);
502 
503 #ifndef NDEBUG
504       if (TracePartialInlining)
505         dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
506 #endif
507 
508       if (OutlineRegionCost < MinOutlineRegionCost) {
509         ORE.emit([&]() {
510           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
511                                             &SI->front())
512                  << ore::NV("Callee", F) << " inline cost-savings smaller than "
513                  << ore::NV("Cost", MinOutlineRegionCost);
514         });
515         continue;
516       }
517       // For now, ignore blocks that belong to a SISE region that is a
518       // candidate for outlining.  In the future, we may want to look
519       // at inner regions because the outer region may have live-exit
520       // variables.
521       for (auto *BB : DominateVector)
522         VisitedMap[BB] = true;
523       // ReturnBlock here means the block after the outline call
524       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
525       // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
526       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
527           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
528       RegInfo.Region = DominateVector;
529       OutliningInfo->ORI.push_back(RegInfo);
530 #ifndef NDEBUG
531       if (TracePartialInlining) {
532         dbgs() << "Found Cold Candidate starting at block: "
533                << DominateVector.front()->getName() << "\n";
534       }
535 #endif
536       ColdCandidateFound = true;
537       NumColdRegionsFound++;
538     }
539   }
540   if (ColdCandidateFound)
541     return OutliningInfo;
542   else
543     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
544 }
545 
546 std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function * F)547 PartialInlinerImpl::computeOutliningInfo(Function *F) {
548   BasicBlock *EntryBlock = &F->front();
549   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
550   if (!BR || BR->isUnconditional())
551     return std::unique_ptr<FunctionOutliningInfo>();
552 
553   // Returns true if Succ is BB's successor
554   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
555     return is_contained(successors(BB), Succ);
556   };
557 
558   auto IsReturnBlock = [](BasicBlock *BB) {
559     TerminatorInst *TI = BB->getTerminator();
560     return isa<ReturnInst>(TI);
561   };
562 
563   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
564     if (IsReturnBlock(Succ1))
565       return std::make_tuple(Succ1, Succ2);
566     if (IsReturnBlock(Succ2))
567       return std::make_tuple(Succ2, Succ1);
568 
569     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
570   };
571 
572   // Detect a triangular shape:
573   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
574     if (IsSuccessor(Succ1, Succ2))
575       return std::make_tuple(Succ1, Succ2);
576     if (IsSuccessor(Succ2, Succ1))
577       return std::make_tuple(Succ2, Succ1);
578 
579     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
580   };
581 
582   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
583       llvm::make_unique<FunctionOutliningInfo>();
584 
585   BasicBlock *CurrEntry = EntryBlock;
586   bool CandidateFound = false;
587   do {
588     // The number of blocks to be inlined has already reached
589     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
590     // disables partial inlining for the function.
591     if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
592       break;
593 
594     if (succ_size(CurrEntry) != 2)
595       break;
596 
597     BasicBlock *Succ1 = *succ_begin(CurrEntry);
598     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
599 
600     BasicBlock *ReturnBlock, *NonReturnBlock;
601     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
602 
603     if (ReturnBlock) {
604       OutliningInfo->Entries.push_back(CurrEntry);
605       OutliningInfo->ReturnBlock = ReturnBlock;
606       OutliningInfo->NonReturnBlock = NonReturnBlock;
607       CandidateFound = true;
608       break;
609     }
610 
611     BasicBlock *CommSucc;
612     BasicBlock *OtherSucc;
613     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
614 
615     if (!CommSucc)
616       break;
617 
618     OutliningInfo->Entries.push_back(CurrEntry);
619     CurrEntry = OtherSucc;
620   } while (true);
621 
622   if (!CandidateFound)
623     return std::unique_ptr<FunctionOutliningInfo>();
624 
625   // Do sanity check of the entries: threre should not
626   // be any successors (not in the entry set) other than
627   // {ReturnBlock, NonReturnBlock}
628   assert(OutliningInfo->Entries[0] == &F->front() &&
629          "Function Entry must be the first in Entries vector");
630   DenseSet<BasicBlock *> Entries;
631   for (BasicBlock *E : OutliningInfo->Entries)
632     Entries.insert(E);
633 
634   // Returns true of BB has Predecessor which is not
635   // in Entries set.
636   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
637     for (auto Pred : predecessors(BB)) {
638       if (!Entries.count(Pred))
639         return true;
640     }
641     return false;
642   };
643   auto CheckAndNormalizeCandidate =
644       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
645         for (BasicBlock *E : OutliningInfo->Entries) {
646           for (auto Succ : successors(E)) {
647             if (Entries.count(Succ))
648               continue;
649             if (Succ == OutliningInfo->ReturnBlock)
650               OutliningInfo->ReturnBlockPreds.push_back(E);
651             else if (Succ != OutliningInfo->NonReturnBlock)
652               return false;
653           }
654           // There should not be any outside incoming edges either:
655           if (HasNonEntryPred(E))
656             return false;
657         }
658         return true;
659       };
660 
661   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
662     return std::unique_ptr<FunctionOutliningInfo>();
663 
664   // Now further growing the candidate's inlining region by
665   // peeling off dominating blocks from the outlining region:
666   while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
667     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
668     if (succ_size(Cand) != 2)
669       break;
670 
671     if (HasNonEntryPred(Cand))
672       break;
673 
674     BasicBlock *Succ1 = *succ_begin(Cand);
675     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
676 
677     BasicBlock *ReturnBlock, *NonReturnBlock;
678     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
679     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
680       break;
681 
682     if (NonReturnBlock->getSinglePredecessor() != Cand)
683       break;
684 
685     // Now grow and update OutlininigInfo:
686     OutliningInfo->Entries.push_back(Cand);
687     OutliningInfo->NonReturnBlock = NonReturnBlock;
688     OutliningInfo->ReturnBlockPreds.push_back(Cand);
689     Entries.insert(Cand);
690   }
691 
692   return OutliningInfo;
693 }
694 
695 // Check if there is PGO data or user annoated branch data:
hasProfileData(Function * F,FunctionOutliningInfo * OI)696 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
697   if (F->hasProfileData())
698     return true;
699   // Now check if any of the entry block has MD_prof data:
700   for (auto *E : OI->Entries) {
701     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
702     if (!BR || BR->isUnconditional())
703       continue;
704     uint64_t T, F;
705     if (BR->extractProfMetadata(T, F))
706       return true;
707   }
708   return false;
709 }
710 
711 BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner)712 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
713   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
714   auto EntryFreq =
715       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
716   auto OutliningCallFreq =
717       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
718   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
719   // we outlined any regions, so we may encounter situations where the
720   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
721   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
722     OutliningCallFreq = EntryFreq;
723   }
724   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
725       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
726 
727   if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
728     return OutlineRegionRelFreq;
729 
730   // When profile data is not available, we need to be conservative in
731   // estimating the overall savings. Static branch prediction can usually
732   // guess the branch direction right (taken/non-taken), but the guessed
733   // branch probability is usually not biased enough. In case when the
734   // outlined region is predicted to be likely, its probability needs
735   // to be made higher (more biased) to not under-estimate the cost of
736   // function outlining. On the other hand, if the outlined region
737   // is predicted to be less likely, the predicted probablity is usually
738   // higher than the actual. For instance, the actual probability of the
739   // less likely target is only 5%, but the guessed probablity can be
740   // 40%. In the latter case, there is no need for further adjustement.
741   // FIXME: add an option for this.
742   if (OutlineRegionRelFreq < BranchProbability(45, 100))
743     return OutlineRegionRelFreq;
744 
745   OutlineRegionRelFreq = std::max(
746       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
747 
748   return OutlineRegionRelFreq;
749 }
750 
shouldPartialInline(CallSite CS,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE)751 bool PartialInlinerImpl::shouldPartialInline(
752     CallSite CS, FunctionCloner &Cloner,
753     BlockFrequency WeightedOutliningRcost,
754     OptimizationRemarkEmitter &ORE) {
755   using namespace ore;
756 
757   Instruction *Call = CS.getInstruction();
758   Function *Callee = CS.getCalledFunction();
759   assert(Callee == Cloner.ClonedFunc);
760 
761   if (SkipCostAnalysis)
762     return isInlineViable(*Callee);
763 
764   Function *Caller = CS.getCaller();
765   auto &CalleeTTI = (*GetTTI)(*Callee);
766   InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
767                                 *GetAssumptionCache, GetBFI, PSI, &ORE);
768 
769   if (IC.isAlways()) {
770     ORE.emit([&]() {
771       return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
772              << NV("Callee", Cloner.OrigFunc)
773              << " should always be fully inlined, not partially";
774     });
775     return false;
776   }
777 
778   if (IC.isNever()) {
779     ORE.emit([&]() {
780       return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
781              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
782              << NV("Caller", Caller)
783              << " because it should never be inlined (cost=never)";
784     });
785     return false;
786   }
787 
788   if (!IC) {
789     ORE.emit([&]() {
790       return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
791              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
792              << NV("Caller", Caller) << " because too costly to inline (cost="
793              << NV("Cost", IC.getCost()) << ", threshold="
794              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
795     });
796     return false;
797   }
798   const DataLayout &DL = Caller->getParent()->getDataLayout();
799 
800   // The savings of eliminating the call:
801   int NonWeightedSavings = getCallsiteCost(CS, DL);
802   BlockFrequency NormWeightedSavings(NonWeightedSavings);
803 
804   // Weighted saving is smaller than weighted cost, return false
805   if (NormWeightedSavings < WeightedOutliningRcost) {
806     ORE.emit([&]() {
807       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
808                                         Call)
809              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
810              << NV("Caller", Caller) << " runtime overhead (overhead="
811              << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
812              << ", savings="
813              << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
814              << ")"
815              << " of making the outlined call is too high";
816     });
817 
818     return false;
819   }
820 
821   ORE.emit([&]() {
822     return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
823            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
824            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
825            << " (threshold="
826            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
827   });
828   return true;
829 }
830 
831 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
832 // For now use a simplified version. The returned 'InlineCost' will be used
833 // to esimate the size cost as well as runtime cost of the BB.
computeBBInlineCost(BasicBlock * BB)834 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
835   int InlineCost = 0;
836   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
837   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
838     if (isa<DbgInfoIntrinsic>(I))
839       continue;
840 
841     switch (I->getOpcode()) {
842     case Instruction::BitCast:
843     case Instruction::PtrToInt:
844     case Instruction::IntToPtr:
845     case Instruction::Alloca:
846       continue;
847     case Instruction::GetElementPtr:
848       if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
849         continue;
850       break;
851     default:
852       break;
853     }
854 
855     IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
856     if (IntrInst) {
857       if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
858           IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
859         continue;
860     }
861 
862     if (CallInst *CI = dyn_cast<CallInst>(I)) {
863       InlineCost += getCallsiteCost(CallSite(CI), DL);
864       continue;
865     }
866 
867     if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
868       InlineCost += getCallsiteCost(CallSite(II), DL);
869       continue;
870     }
871 
872     if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
873       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
874       continue;
875     }
876     InlineCost += InlineConstants::InstrCost;
877   }
878   return InlineCost;
879 }
880 
881 std::tuple<int, int>
computeOutliningCosts(FunctionCloner & Cloner)882 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
883   int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
884   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
885     Function *OutlinedFunc = FuncBBPair.first;
886     BasicBlock* OutliningCallBB = FuncBBPair.second;
887     // Now compute the cost of the call sequence to the outlined function
888     // 'OutlinedFunction' in BB 'OutliningCallBB':
889     OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
890 
891     // Now compute the cost of the extracted/outlined function itself:
892     for (BasicBlock &BB : *OutlinedFunc)
893       OutlinedFunctionCost += computeBBInlineCost(&BB);
894   }
895   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
896          "Outlined function cost should be no less than the outlined region");
897 
898   // The code extractor introduces a new root and exit stub blocks with
899   // additional unconditional branches. Those branches will be eliminated
900   // later with bb layout. The cost should be adjusted accordingly:
901   OutlinedFunctionCost -=
902       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
903 
904   int OutliningRuntimeOverhead =
905       OutliningFuncCallCost +
906       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
907       ExtraOutliningPenalty;
908 
909   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
910 }
911 
912 // Create the callsite to profile count map which is
913 // used to update the original function's entry count,
914 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap)915 void PartialInlinerImpl::computeCallsiteToProfCountMap(
916     Function *DuplicateFunction,
917     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
918   std::vector<User *> Users(DuplicateFunction->user_begin(),
919                             DuplicateFunction->user_end());
920   Function *CurrentCaller = nullptr;
921   std::unique_ptr<BlockFrequencyInfo> TempBFI;
922   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
923 
924   auto ComputeCurrBFI = [&,this](Function *Caller) {
925       // For the old pass manager:
926       if (!GetBFI) {
927         DominatorTree DT(*Caller);
928         LoopInfo LI(DT);
929         BranchProbabilityInfo BPI(*Caller, LI);
930         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
931         CurrentCallerBFI = TempBFI.get();
932       } else {
933         // New pass manager:
934         CurrentCallerBFI = &(*GetBFI)(*Caller);
935       }
936   };
937 
938   for (User *User : Users) {
939     CallSite CS = getCallSite(User);
940     Function *Caller = CS.getCaller();
941     if (CurrentCaller != Caller) {
942       CurrentCaller = Caller;
943       ComputeCurrBFI(Caller);
944     } else {
945       assert(CurrentCallerBFI && "CallerBFI is not set");
946     }
947     BasicBlock *CallBB = CS.getInstruction()->getParent();
948     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
949     if (Count)
950       CallSiteToProfCountMap[User] = *Count;
951     else
952       CallSiteToProfCountMap[User] = 0;
953   }
954 }
955 
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE)956 PartialInlinerImpl::FunctionCloner::FunctionCloner(
957     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE)
958     : OrigFunc(F), ORE(ORE) {
959   ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
960 
961   // Clone the function, so that we can hack away on it.
962   ValueToValueMapTy VMap;
963   ClonedFunc = CloneFunction(F, VMap);
964 
965   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
966   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
967   for (BasicBlock *BB : OI->Entries) {
968     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
969   }
970   for (BasicBlock *E : OI->ReturnBlockPreds) {
971     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
972     ClonedOI->ReturnBlockPreds.push_back(NewE);
973   }
974   // Go ahead and update all uses to the duplicate, so that we can just
975   // use the inliner functionality when we're done hacking.
976   F->replaceAllUsesWith(ClonedFunc);
977 }
978 
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE)979 PartialInlinerImpl::FunctionCloner::FunctionCloner(
980     Function *F, FunctionOutliningMultiRegionInfo *OI,
981     OptimizationRemarkEmitter &ORE)
982     : OrigFunc(F), ORE(ORE) {
983   ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();
984 
985   // Clone the function, so that we can hack away on it.
986   ValueToValueMapTy VMap;
987   ClonedFunc = CloneFunction(F, VMap);
988 
989   // Go through all Outline Candidate Regions and update all BasicBlock
990   // information.
991   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
992        OI->ORI) {
993     SmallVector<BasicBlock *, 8> Region;
994     for (BasicBlock *BB : RegionInfo.Region) {
995       Region.push_back(cast<BasicBlock>(VMap[BB]));
996     }
997     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
998     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
999     BasicBlock *NewReturnBlock = nullptr;
1000     if (RegionInfo.ReturnBlock)
1001       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1002     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1003         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1004     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1005   }
1006   // Go ahead and update all uses to the duplicate, so that we can just
1007   // use the inliner functionality when we're done hacking.
1008   F->replaceAllUsesWith(ClonedFunc);
1009 }
1010 
NormalizeReturnBlock()1011 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1012   auto getFirstPHI = [](BasicBlock *BB) {
1013     BasicBlock::iterator I = BB->begin();
1014     PHINode *FirstPhi = nullptr;
1015     while (I != BB->end()) {
1016       PHINode *Phi = dyn_cast<PHINode>(I);
1017       if (!Phi)
1018         break;
1019       if (!FirstPhi) {
1020         FirstPhi = Phi;
1021         break;
1022       }
1023     }
1024     return FirstPhi;
1025   };
1026 
1027   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1028   // blocks.
1029   if (!ClonedOI)
1030     return;
1031 
1032   // Special hackery is needed with PHI nodes that have inputs from more than
1033   // one extracted block.  For simplicity, just split the PHIs into a two-level
1034   // sequence of PHIs, some of which will go in the extracted region, and some
1035   // of which will go outside.
1036   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1037   // only split block when necessary:
1038   PHINode *FirstPhi = getFirstPHI(PreReturn);
1039   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1040 
1041   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1042     return;
1043 
1044   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1045     Value *CommonValue = PN->getIncomingValue(0);
1046     if (all_of(PN->incoming_values(),
1047                [&](Value *V) { return V == CommonValue; }))
1048       return CommonValue;
1049     return nullptr;
1050   };
1051 
1052   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1053       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1054   BasicBlock::iterator I = PreReturn->begin();
1055   Instruction *Ins = &ClonedOI->ReturnBlock->front();
1056   SmallVector<Instruction *, 4> DeadPhis;
1057   while (I != PreReturn->end()) {
1058     PHINode *OldPhi = dyn_cast<PHINode>(I);
1059     if (!OldPhi)
1060       break;
1061 
1062     PHINode *RetPhi =
1063         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1064     OldPhi->replaceAllUsesWith(RetPhi);
1065     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1066 
1067     RetPhi->addIncoming(&*I, PreReturn);
1068     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1069       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1070       OldPhi->removeIncomingValue(E);
1071     }
1072 
1073     // After incoming values splitting, the old phi may become trivial.
1074     // Keeping the trivial phi can introduce definition inside the outline
1075     // region which is live-out, causing necessary overhead (load, store
1076     // arg passing etc).
1077     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1078       OldPhi->replaceAllUsesWith(OldPhiVal);
1079       DeadPhis.push_back(OldPhi);
1080     }
1081     ++I;
1082   }
1083   for (auto *DP : DeadPhis)
1084     DP->eraseFromParent();
1085 
1086   for (auto E : ClonedOI->ReturnBlockPreds) {
1087     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1088   }
1089 }
1090 
doMultiRegionFunctionOutlining()1091 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1092 
1093   auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1094     int Cost = 0;
1095     for (BasicBlock* BB : Region)
1096       Cost += computeBBInlineCost(BB);
1097     return Cost;
1098   };
1099 
1100   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1101 
1102   if (ClonedOMRI->ORI.empty())
1103     return false;
1104 
1105   // The CodeExtractor needs a dominator tree.
1106   DominatorTree DT;
1107   DT.recalculate(*ClonedFunc);
1108 
1109   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1110   LoopInfo LI(DT);
1111   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1112   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1113 
1114   SetVector<Value *> Inputs, Outputs, Sinks;
1115   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1116        ClonedOMRI->ORI) {
1117     int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1118 
1119     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1120                      ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false);
1121 
1122     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1123 
1124 #ifndef NDEBUG
1125     if (TracePartialInlining) {
1126       dbgs() << "inputs: " << Inputs.size() << "\n";
1127       dbgs() << "outputs: " << Outputs.size() << "\n";
1128       for (Value *value : Inputs)
1129         dbgs() << "value used in func: " << *value << "\n";
1130       for (Value *output : Outputs)
1131         dbgs() << "instr used in func: " << *output << "\n";
1132     }
1133 #endif
1134     // Do not extract regions that have live exit variables.
1135     if (Outputs.size() > 0 && !ForceLiveExit)
1136       continue;
1137 
1138     Function *OutlinedFunc = CE.extractCodeRegion();
1139 
1140     if (OutlinedFunc) {
1141       CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1142       BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
1143       assert(OutliningCallBB->getParent() == ClonedFunc);
1144       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1145       NumColdRegionsOutlined++;
1146       OutlinedRegionCost += CurrentOutlinedRegionCost;
1147 
1148       if (MarkOutlinedColdCC) {
1149         OutlinedFunc->setCallingConv(CallingConv::Cold);
1150         OCS.setCallingConv(CallingConv::Cold);
1151       }
1152     } else
1153       ORE.emit([&]() {
1154         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1155                                         &RegionInfo.Region.front()->front())
1156                << "Failed to extract region at block "
1157                << ore::NV("Block", RegionInfo.Region.front());
1158       });
1159   }
1160 
1161   return !OutlinedFunctions.empty();
1162 }
1163 
1164 Function *
doSingleRegionFunctionOutlining()1165 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1166   // Returns true if the block is to be partial inlined into the caller
1167   // (i.e. not to be extracted to the out of line function)
1168   auto ToBeInlined = [&, this](BasicBlock *BB) {
1169     return BB == ClonedOI->ReturnBlock ||
1170            (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1171             ClonedOI->Entries.end());
1172   };
1173 
1174   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1175   // The CodeExtractor needs a dominator tree.
1176   DominatorTree DT;
1177   DT.recalculate(*ClonedFunc);
1178 
1179   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1180   LoopInfo LI(DT);
1181   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1182   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1183 
1184   // Gather up the blocks that we're going to extract.
1185   std::vector<BasicBlock *> ToExtract;
1186   ToExtract.push_back(ClonedOI->NonReturnBlock);
1187   OutlinedRegionCost +=
1188       PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1189   for (BasicBlock &BB : *ClonedFunc)
1190     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1191       ToExtract.push_back(&BB);
1192       // FIXME: the code extractor may hoist/sink more code
1193       // into the outlined function which may make the outlining
1194       // overhead (the difference of the outlined function cost
1195       // and OutliningRegionCost) look larger.
1196       OutlinedRegionCost += computeBBInlineCost(&BB);
1197     }
1198 
1199   // Extract the body of the if.
1200   Function *OutlinedFunc =
1201       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1202                     ClonedFuncBFI.get(), &BPI,
1203                     /* AllowVarargs */ true)
1204           .extractCodeRegion();
1205 
1206   if (OutlinedFunc) {
1207     BasicBlock *OutliningCallBB =
1208         PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1209             .getInstruction()
1210             ->getParent();
1211     assert(OutliningCallBB->getParent() == ClonedFunc);
1212     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1213   } else
1214     ORE.emit([&]() {
1215       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1216                                       &ToExtract.front()->front())
1217              << "Failed to extract region at block "
1218              << ore::NV("Block", ToExtract.front());
1219     });
1220 
1221   return OutlinedFunc;
1222 }
1223 
~FunctionCloner()1224 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1225   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1226   // users (function pointers, etc.) back to the original function.
1227   ClonedFunc->replaceAllUsesWith(OrigFunc);
1228   ClonedFunc->eraseFromParent();
1229   if (!IsFunctionInlined) {
1230     // Remove each function that was speculatively created if there is no
1231     // reference.
1232     for (auto FuncBBPair : OutlinedFunctions) {
1233       Function *Func = FuncBBPair.first;
1234       Func->eraseFromParent();
1235     }
1236   }
1237 }
1238 
unswitchFunction(Function * F)1239 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1240 
1241   if (F->hasAddressTaken())
1242     return {false, nullptr};
1243 
1244   // Let inliner handle it
1245   if (F->hasFnAttribute(Attribute::AlwaysInline))
1246     return {false, nullptr};
1247 
1248   if (F->hasFnAttribute(Attribute::NoInline))
1249     return {false, nullptr};
1250 
1251   if (PSI->isFunctionEntryCold(F))
1252     return {false, nullptr};
1253 
1254   if (F->user_begin() == F->user_end())
1255     return {false, nullptr};
1256 
1257   OptimizationRemarkEmitter ORE(F);
1258 
1259   // Only try to outline cold regions if we have a profile summary, which
1260   // implies we have profiling information.
1261   if (PSI->hasProfileSummary() && F->hasProfileData() &&
1262       !DisableMultiRegionPartialInline) {
1263     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1264         computeOutliningColdRegionsInfo(F, ORE);
1265     if (OMRI) {
1266       FunctionCloner Cloner(F, OMRI.get(), ORE);
1267 
1268 #ifndef NDEBUG
1269       if (TracePartialInlining) {
1270         dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
1271         dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
1272                << "\n";
1273       }
1274 #endif
1275       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1276 
1277       if (DidOutline) {
1278 #ifndef NDEBUG
1279         if (TracePartialInlining) {
1280           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1281           Cloner.ClonedFunc->print(dbgs());
1282           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1283         }
1284 #endif
1285 
1286         if (tryPartialInline(Cloner))
1287           return {true, nullptr};
1288       }
1289     }
1290   }
1291 
1292   // Fall-thru to regular partial inlining if we:
1293   //    i) can't find any cold regions to outline, or
1294   //   ii) can't inline the outlined function anywhere.
1295   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1296   if (!OI)
1297     return {false, nullptr};
1298 
1299   FunctionCloner Cloner(F, OI.get(), ORE);
1300   Cloner.NormalizeReturnBlock();
1301 
1302   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1303 
1304   if (!OutlinedFunction)
1305     return {false, nullptr};
1306 
1307   bool AnyInline = tryPartialInline(Cloner);
1308 
1309   if (AnyInline)
1310     return {true, OutlinedFunction};
1311 
1312   return {false, nullptr};
1313 }
1314 
tryPartialInline(FunctionCloner & Cloner)1315 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1316   if (Cloner.OutlinedFunctions.empty())
1317     return false;
1318 
1319   int SizeCost = 0;
1320   BlockFrequency WeightedRcost;
1321   int NonWeightedRcost;
1322   std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1323 
1324   // Only calculate RelativeToEntryFreq when we are doing single region
1325   // outlining.
1326   BranchProbability RelativeToEntryFreq;
1327   if (Cloner.ClonedOI) {
1328     RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1329   } else
1330     // RelativeToEntryFreq doesn't make sense when we have more than one
1331     // outlined call because each call will have a different relative frequency
1332     // to the entry block.  We can consider using the average, but the
1333     // usefulness of that information is questionable. For now, assume we never
1334     // execute the calls to outlined functions.
1335     RelativeToEntryFreq = BranchProbability(0, 1);
1336 
1337   WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1338 
1339   // The call sequence(s) to the outlined function(s) are larger than the sum of
1340   // the original outlined region size(s), it does not increase the chances of
1341   // inlining the function with outlining (The inliner uses the size increase to
1342   // model the cost of inlining a callee).
1343   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1344     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1345     DebugLoc DLoc;
1346     BasicBlock *Block;
1347     std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1348     OrigFuncORE.emit([&]() {
1349       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1350                                         DLoc, Block)
1351              << ore::NV("Function", Cloner.OrigFunc)
1352              << " not partially inlined into callers (Original Size = "
1353              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1354              << ", Size of call sequence to outlined function = "
1355              << ore::NV("NewSize", SizeCost) << ")";
1356     });
1357     return false;
1358   }
1359 
1360   assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
1361          "F's users should all be replaced!");
1362 
1363   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1364                             Cloner.ClonedFunc->user_end());
1365 
1366   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1367   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1368   if (CalleeEntryCount)
1369     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1370 
1371   uint64_t CalleeEntryCountV =
1372       (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1373 
1374   bool AnyInline = false;
1375   for (User *User : Users) {
1376     CallSite CS = getCallSite(User);
1377 
1378     if (IsLimitReached())
1379       continue;
1380 
1381     OptimizationRemarkEmitter CallerORE(CS.getCaller());
1382     if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
1383       continue;
1384 
1385     // Construct remark before doing the inlining, as after successful inlining
1386     // the callsite is removed.
1387     OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
1388     OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1389        << ore::NV("Caller", CS.getCaller());
1390 
1391     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
1392     // We can only forward varargs when we outlined a single region, else we
1393     // bail on vararg functions.
1394     if (!InlineFunction(CS, IFI, nullptr, true,
1395                         (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1396                                          : nullptr)))
1397       continue;
1398 
1399     CallerORE.emit(OR);
1400 
1401     // Now update the entry count:
1402     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1403       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1404       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1405     }
1406 
1407     AnyInline = true;
1408     NumPartialInlining++;
1409     // Update the stats
1410     if (Cloner.ClonedOI)
1411       NumPartialInlined++;
1412     else
1413       NumColdOutlinePartialInlined++;
1414 
1415   }
1416 
1417   if (AnyInline) {
1418     Cloner.IsFunctionInlined = true;
1419     if (CalleeEntryCount)
1420       Cloner.OrigFunc->setEntryCount(
1421           CalleeEntryCount.setCount(CalleeEntryCountV));
1422     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1423     OrigFuncORE.emit([&]() {
1424       return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1425              << "Partially inlined into at least one caller";
1426     });
1427 
1428   }
1429 
1430   return AnyInline;
1431 }
1432 
run(Module & M)1433 bool PartialInlinerImpl::run(Module &M) {
1434   if (DisablePartialInlining)
1435     return false;
1436 
1437   std::vector<Function *> Worklist;
1438   Worklist.reserve(M.size());
1439   for (Function &F : M)
1440     if (!F.use_empty() && !F.isDeclaration())
1441       Worklist.push_back(&F);
1442 
1443   bool Changed = false;
1444   while (!Worklist.empty()) {
1445     Function *CurrFunc = Worklist.back();
1446     Worklist.pop_back();
1447 
1448     if (CurrFunc->use_empty())
1449       continue;
1450 
1451     bool Recursive = false;
1452     for (User *U : CurrFunc->users())
1453       if (Instruction *I = dyn_cast<Instruction>(U))
1454         if (I->getParent()->getParent() == CurrFunc) {
1455           Recursive = true;
1456           break;
1457         }
1458     if (Recursive)
1459       continue;
1460 
1461     std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1462     if (Result.second)
1463       Worklist.push_back(Result.second);
1464     if (Result.first) {
1465       Changed = true;
1466     }
1467   }
1468 
1469   return Changed;
1470 }
1471 
1472 char PartialInlinerLegacyPass::ID = 0;
1473 
1474 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1475                       "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)1476 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1477 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1478 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1479 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1480                     "Partial Inliner", false, false)
1481 
1482 ModulePass *llvm::createPartialInliningPass() {
1483   return new PartialInlinerLegacyPass();
1484 }
1485 
run(Module & M,ModuleAnalysisManager & AM)1486 PreservedAnalyses PartialInlinerPass::run(Module &M,
1487                                           ModuleAnalysisManager &AM) {
1488   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1489 
1490   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
1491       [&FAM](Function &F) -> AssumptionCache & {
1492     return FAM.getResult<AssumptionAnalysis>(F);
1493   };
1494 
1495   std::function<BlockFrequencyInfo &(Function &)> GetBFI =
1496       [&FAM](Function &F) -> BlockFrequencyInfo & {
1497     return FAM.getResult<BlockFrequencyAnalysis>(F);
1498   };
1499 
1500   std::function<TargetTransformInfo &(Function &)> GetTTI =
1501       [&FAM](Function &F) -> TargetTransformInfo & {
1502     return FAM.getResult<TargetIRAnalysis>(F);
1503   };
1504 
1505   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1506 
1507   if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI)
1508           .run(M))
1509     return PreservedAnalyses::none();
1510   return PreservedAnalyses::all();
1511 }
1512