1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/CFG.h"
25 #include "llvm/Analysis/InstructionSimplify.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/Config/llvm-config.h"
30 #include "llvm/IR/CallSite.h"
31 #include "llvm/IR/CallingConv.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/GetElementPtrTypeIterator.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/InstVisitor.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Operator.h"
38 #include "llvm/Support/Debug.h"
39 #include "llvm/Support/raw_ostream.h"
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "inline-cost"
44 
45 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
46 
47 static cl::opt<int> InlineThreshold(
48     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
49     cl::desc("Control the amount of inlining to perform (default = 225)"));
50 
51 static cl::opt<int> HintThreshold(
52     "inlinehint-threshold", cl::Hidden, cl::init(325),
53     cl::desc("Threshold for inlining functions with inline hint"));
54 
55 static cl::opt<int>
56     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
57                           cl::init(45),
58                           cl::desc("Threshold for inlining cold callsites"));
59 
60 // We introduce this threshold to help performance of instrumentation based
61 // PGO before we actually hook up inliner with analysis passes such as BPI and
62 // BFI.
63 static cl::opt<int> ColdThreshold(
64     "inlinecold-threshold", cl::Hidden, cl::init(45),
65     cl::desc("Threshold for inlining functions with cold attribute"));
66 
67 static cl::opt<int>
68     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
69                          cl::ZeroOrMore,
70                          cl::desc("Threshold for hot callsites "));
71 
72 static cl::opt<int> LocallyHotCallSiteThreshold(
73     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
74     cl::desc("Threshold for locally hot callsites "));
75 
76 static cl::opt<int> ColdCallSiteRelFreq(
77     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
78     cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
79              "entry frequency, for a callsite to be cold in the absence of "
80              "profile information."));
81 
82 static cl::opt<int> HotCallSiteRelFreq(
83     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
84     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
85              "entry frequency, for a callsite to be hot in the absence of "
86              "profile information."));
87 
88 static cl::opt<bool> OptComputeFullInlineCost(
89     "inline-cost-full", cl::Hidden, cl::init(false),
90     cl::desc("Compute the full inline cost of a call site even when the cost "
91              "exceeds the threshold."));
92 
93 namespace {
94 
95 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
96   typedef InstVisitor<CallAnalyzer, bool> Base;
97   friend class InstVisitor<CallAnalyzer, bool>;
98 
99   /// The TargetTransformInfo available for this compilation.
100   const TargetTransformInfo &TTI;
101 
102   /// Getter for the cache of @llvm.assume intrinsics.
103   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
104 
105   /// Getter for BlockFrequencyInfo
106   Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
107 
108   /// Profile summary information.
109   ProfileSummaryInfo *PSI;
110 
111   /// The called function.
112   Function &F;
113 
114   // Cache the DataLayout since we use it a lot.
115   const DataLayout &DL;
116 
117   /// The OptimizationRemarkEmitter available for this compilation.
118   OptimizationRemarkEmitter *ORE;
119 
120   /// The candidate callsite being analyzed. Please do not use this to do
121   /// analysis in the caller function; we want the inline cost query to be
122   /// easily cacheable. Instead, use the cover function paramHasAttr.
123   CallSite CandidateCS;
124 
125   /// Tunable parameters that control the analysis.
126   const InlineParams &Params;
127 
128   int Threshold;
129   int Cost;
130   bool ComputeFullInlineCost;
131 
132   bool IsCallerRecursive;
133   bool IsRecursiveCall;
134   bool ExposesReturnsTwice;
135   bool HasDynamicAlloca;
136   bool ContainsNoDuplicateCall;
137   bool HasReturn;
138   bool HasIndirectBr;
139   bool HasUninlineableIntrinsic;
140   bool UsesVarArgs;
141 
142   /// Number of bytes allocated statically by the callee.
143   uint64_t AllocatedSize;
144   unsigned NumInstructions, NumVectorInstructions;
145   int VectorBonus, TenPercentVectorBonus;
146   // Bonus to be applied when the callee has only one reachable basic block.
147   int SingleBBBonus;
148 
149   /// While we walk the potentially-inlined instructions, we build up and
150   /// maintain a mapping of simplified values specific to this callsite. The
151   /// idea is to propagate any special information we have about arguments to
152   /// this call through the inlinable section of the function, and account for
153   /// likely simplifications post-inlining. The most important aspect we track
154   /// is CFG altering simplifications -- when we prove a basic block dead, that
155   /// can cause dramatic shifts in the cost of inlining a function.
156   DenseMap<Value *, Constant *> SimplifiedValues;
157 
158   /// Keep track of the values which map back (through function arguments) to
159   /// allocas on the caller stack which could be simplified through SROA.
160   DenseMap<Value *, Value *> SROAArgValues;
161 
162   /// The mapping of caller Alloca values to their accumulated cost savings. If
163   /// we have to disable SROA for one of the allocas, this tells us how much
164   /// cost must be added.
165   DenseMap<Value *, int> SROAArgCosts;
166 
167   /// Keep track of values which map to a pointer base and constant offset.
168   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
169 
170   /// Keep track of dead blocks due to the constant arguments.
171   SetVector<BasicBlock *> DeadBlocks;
172 
173   /// The mapping of the blocks to their known unique successors due to the
174   /// constant arguments.
175   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
176 
177   /// Model the elimination of repeated loads that is expected to happen
178   /// whenever we simplify away the stores that would otherwise cause them to be
179   /// loads.
180   bool EnableLoadElimination;
181   SmallPtrSet<Value *, 16> LoadAddrSet;
182   int LoadEliminationCost;
183 
184   // Custom simplification helper routines.
185   bool isAllocaDerivedArg(Value *V);
186   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
187                             DenseMap<Value *, int>::iterator &CostIt);
188   void disableSROA(DenseMap<Value *, int>::iterator CostIt);
189   void disableSROA(Value *V);
190   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
191   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
192                           int InstructionCost);
193   void disableLoadElimination();
194   bool isGEPFree(GetElementPtrInst &GEP);
195   bool canFoldInboundsGEP(GetElementPtrInst &I);
196   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
197   bool simplifyCallSite(Function *F, CallSite CS);
198   template <typename Callable>
199   bool simplifyInstruction(Instruction &I, Callable Evaluate);
200   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
201 
202   /// Return true if the given argument to the function being considered for
203   /// inlining has the given attribute set either at the call site or the
204   /// function declaration.  Primarily used to inspect call site specific
205   /// attributes since these can be more precise than the ones on the callee
206   /// itself.
207   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
208 
209   /// Return true if the given value is known non null within the callee if
210   /// inlined through this particular callsite.
211   bool isKnownNonNullInCallee(Value *V);
212 
213   /// Update Threshold based on callsite properties such as callee
214   /// attributes and callee hotness for PGO builds. The Callee is explicitly
215   /// passed to support analyzing indirect calls whose target is inferred by
216   /// analysis.
217   void updateThreshold(CallSite CS, Function &Callee);
218 
219   /// Return true if size growth is allowed when inlining the callee at CS.
220   bool allowSizeGrowth(CallSite CS);
221 
222   /// Return true if \p CS is a cold callsite.
223   bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
224 
225   /// Return a higher threshold if \p CS is a hot callsite.
226   Optional<int> getHotCallSiteThreshold(CallSite CS,
227                                         BlockFrequencyInfo *CallerBFI);
228 
229   // Custom analysis routines.
230   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
231 
232   // Disable several entry points to the visitor so we don't accidentally use
233   // them by declaring but not defining them here.
234   void visit(Module *);
235   void visit(Module &);
236   void visit(Function *);
237   void visit(Function &);
238   void visit(BasicBlock *);
239   void visit(BasicBlock &);
240 
241   // Provide base case for our instruction visit.
242   bool visitInstruction(Instruction &I);
243 
244   // Our visit overrides.
245   bool visitAlloca(AllocaInst &I);
246   bool visitPHI(PHINode &I);
247   bool visitGetElementPtr(GetElementPtrInst &I);
248   bool visitBitCast(BitCastInst &I);
249   bool visitPtrToInt(PtrToIntInst &I);
250   bool visitIntToPtr(IntToPtrInst &I);
251   bool visitCastInst(CastInst &I);
252   bool visitUnaryInstruction(UnaryInstruction &I);
253   bool visitCmpInst(CmpInst &I);
254   bool visitSub(BinaryOperator &I);
255   bool visitBinaryOperator(BinaryOperator &I);
256   bool visitLoad(LoadInst &I);
257   bool visitStore(StoreInst &I);
258   bool visitExtractValue(ExtractValueInst &I);
259   bool visitInsertValue(InsertValueInst &I);
260   bool visitCallSite(CallSite CS);
261   bool visitReturnInst(ReturnInst &RI);
262   bool visitBranchInst(BranchInst &BI);
263   bool visitSelectInst(SelectInst &SI);
264   bool visitSwitchInst(SwitchInst &SI);
265   bool visitIndirectBrInst(IndirectBrInst &IBI);
266   bool visitResumeInst(ResumeInst &RI);
267   bool visitCleanupReturnInst(CleanupReturnInst &RI);
268   bool visitCatchReturnInst(CatchReturnInst &RI);
269   bool visitUnreachableInst(UnreachableInst &I);
270 
271 public:
CallAnalyzer(const TargetTransformInfo & TTI,std::function<AssumptionCache & (Function &)> & GetAssumptionCache,Optional<function_ref<BlockFrequencyInfo & (Function &)>> & GetBFI,ProfileSummaryInfo * PSI,OptimizationRemarkEmitter * ORE,Function & Callee,CallSite CSArg,const InlineParams & Params)272   CallAnalyzer(const TargetTransformInfo &TTI,
273                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
274                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
275                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
276                Function &Callee, CallSite CSArg, const InlineParams &Params)
277       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
278         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
279         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
280         Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
281                                        Params.ComputeFullInlineCost || ORE),
282         IsCallerRecursive(false), IsRecursiveCall(false),
283         ExposesReturnsTwice(false), HasDynamicAlloca(false),
284         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
285         HasUninlineableIntrinsic(false), UsesVarArgs(false), AllocatedSize(0),
286         NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
287         SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
288         NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
289         NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
290         NumInstructionsSimplified(0), SROACostSavings(0),
291         SROACostSavingsLost(0) {}
292 
293   bool analyzeCall(CallSite CS);
294 
getThreshold()295   int getThreshold() { return Threshold; }
getCost()296   int getCost() { return Cost; }
297 
298   // Keep a bunch of stats about the cost savings found so we can print them
299   // out when debugging.
300   unsigned NumConstantArgs;
301   unsigned NumConstantOffsetPtrArgs;
302   unsigned NumAllocaArgs;
303   unsigned NumConstantPtrCmps;
304   unsigned NumConstantPtrDiffs;
305   unsigned NumInstructionsSimplified;
306   unsigned SROACostSavings;
307   unsigned SROACostSavingsLost;
308 
309   void dump();
310 };
311 
312 } // namespace
313 
314 /// Test whether the given value is an Alloca-derived function argument.
isAllocaDerivedArg(Value * V)315 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
316   return SROAArgValues.count(V);
317 }
318 
319 /// Lookup the SROA-candidate argument and cost iterator which V maps to.
320 /// Returns false if V does not map to a SROA-candidate.
lookupSROAArgAndCost(Value * V,Value * & Arg,DenseMap<Value *,int>::iterator & CostIt)321 bool CallAnalyzer::lookupSROAArgAndCost(
322     Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
323   if (SROAArgValues.empty() || SROAArgCosts.empty())
324     return false;
325 
326   DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
327   if (ArgIt == SROAArgValues.end())
328     return false;
329 
330   Arg = ArgIt->second;
331   CostIt = SROAArgCosts.find(Arg);
332   return CostIt != SROAArgCosts.end();
333 }
334 
335 /// Disable SROA for the candidate marked by this cost iterator.
336 ///
337 /// This marks the candidate as no longer viable for SROA, and adds the cost
338 /// savings associated with it back into the inline cost measurement.
disableSROA(DenseMap<Value *,int>::iterator CostIt)339 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
340   // If we're no longer able to perform SROA we need to undo its cost savings
341   // and prevent subsequent analysis.
342   Cost += CostIt->second;
343   SROACostSavings -= CostIt->second;
344   SROACostSavingsLost += CostIt->second;
345   SROAArgCosts.erase(CostIt);
346   disableLoadElimination();
347 }
348 
349 /// If 'V' maps to a SROA candidate, disable SROA for it.
disableSROA(Value * V)350 void CallAnalyzer::disableSROA(Value *V) {
351   Value *SROAArg;
352   DenseMap<Value *, int>::iterator CostIt;
353   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
354     disableSROA(CostIt);
355 }
356 
357 /// Accumulate the given cost for a particular SROA candidate.
accumulateSROACost(DenseMap<Value *,int>::iterator CostIt,int InstructionCost)358 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
359                                       int InstructionCost) {
360   CostIt->second += InstructionCost;
361   SROACostSavings += InstructionCost;
362 }
363 
disableLoadElimination()364 void CallAnalyzer::disableLoadElimination() {
365   if (EnableLoadElimination) {
366     Cost += LoadEliminationCost;
367     LoadEliminationCost = 0;
368     EnableLoadElimination = false;
369   }
370 }
371 
372 /// Accumulate a constant GEP offset into an APInt if possible.
373 ///
374 /// Returns false if unable to compute the offset for any reason. Respects any
375 /// simplified values known during the analysis of this callsite.
accumulateGEPOffset(GEPOperator & GEP,APInt & Offset)376 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
377   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
378   assert(IntPtrWidth == Offset.getBitWidth());
379 
380   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
381        GTI != GTE; ++GTI) {
382     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
383     if (!OpC)
384       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
385         OpC = dyn_cast<ConstantInt>(SimpleOp);
386     if (!OpC)
387       return false;
388     if (OpC->isZero())
389       continue;
390 
391     // Handle a struct index, which adds its field offset to the pointer.
392     if (StructType *STy = GTI.getStructTypeOrNull()) {
393       unsigned ElementIdx = OpC->getZExtValue();
394       const StructLayout *SL = DL.getStructLayout(STy);
395       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
396       continue;
397     }
398 
399     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
400     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
401   }
402   return true;
403 }
404 
405 /// Use TTI to check whether a GEP is free.
406 ///
407 /// Respects any simplified values known during the analysis of this callsite.
isGEPFree(GetElementPtrInst & GEP)408 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
409   SmallVector<Value *, 4> Operands;
410   Operands.push_back(GEP.getOperand(0));
411   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
412     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
413        Operands.push_back(SimpleOp);
414      else
415        Operands.push_back(*I);
416   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
417 }
418 
visitAlloca(AllocaInst & I)419 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
420   // Check whether inlining will turn a dynamic alloca into a static
421   // alloca and handle that case.
422   if (I.isArrayAllocation()) {
423     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
424     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
425       Type *Ty = I.getAllocatedType();
426       AllocatedSize = SaturatingMultiplyAdd(
427           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
428       return Base::visitAlloca(I);
429     }
430   }
431 
432   // Accumulate the allocated size.
433   if (I.isStaticAlloca()) {
434     Type *Ty = I.getAllocatedType();
435     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
436   }
437 
438   // We will happily inline static alloca instructions.
439   if (I.isStaticAlloca())
440     return Base::visitAlloca(I);
441 
442   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
443   // a variety of reasons, and so we would like to not inline them into
444   // functions which don't currently have a dynamic alloca. This simply
445   // disables inlining altogether in the presence of a dynamic alloca.
446   HasDynamicAlloca = true;
447   return false;
448 }
449 
visitPHI(PHINode & I)450 bool CallAnalyzer::visitPHI(PHINode &I) {
451   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
452   // though we don't want to propagate it's bonuses. The idea is to disable
453   // SROA if it *might* be used in an inappropriate manner.
454 
455   // Phi nodes are always zero-cost.
456   // FIXME: Pointer sizes may differ between different address spaces, so do we
457   // need to use correct address space in the call to getPointerSizeInBits here?
458   // Or could we skip the getPointerSizeInBits call completely? As far as I can
459   // see the ZeroOffset is used as a dummy value, so we can probably use any
460   // bit width for the ZeroOffset?
461   APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
462   bool CheckSROA = I.getType()->isPointerTy();
463 
464   // Track the constant or pointer with constant offset we've seen so far.
465   Constant *FirstC = nullptr;
466   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
467   Value *FirstV = nullptr;
468 
469   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
470     BasicBlock *Pred = I.getIncomingBlock(i);
471     // If the incoming block is dead, skip the incoming block.
472     if (DeadBlocks.count(Pred))
473       continue;
474     // If the parent block of phi is not the known successor of the incoming
475     // block, skip the incoming block.
476     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
477     if (KnownSuccessor && KnownSuccessor != I.getParent())
478       continue;
479 
480     Value *V = I.getIncomingValue(i);
481     // If the incoming value is this phi itself, skip the incoming value.
482     if (&I == V)
483       continue;
484 
485     Constant *C = dyn_cast<Constant>(V);
486     if (!C)
487       C = SimplifiedValues.lookup(V);
488 
489     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
490     if (!C && CheckSROA)
491       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
492 
493     if (!C && !BaseAndOffset.first)
494       // The incoming value is neither a constant nor a pointer with constant
495       // offset, exit early.
496       return true;
497 
498     if (FirstC) {
499       if (FirstC == C)
500         // If we've seen a constant incoming value before and it is the same
501         // constant we see this time, continue checking the next incoming value.
502         continue;
503       // Otherwise early exit because we either see a different constant or saw
504       // a constant before but we have a pointer with constant offset this time.
505       return true;
506     }
507 
508     if (FirstV) {
509       // The same logic as above, but check pointer with constant offset here.
510       if (FirstBaseAndOffset == BaseAndOffset)
511         continue;
512       return true;
513     }
514 
515     if (C) {
516       // This is the 1st time we've seen a constant, record it.
517       FirstC = C;
518       continue;
519     }
520 
521     // The remaining case is that this is the 1st time we've seen a pointer with
522     // constant offset, record it.
523     FirstV = V;
524     FirstBaseAndOffset = BaseAndOffset;
525   }
526 
527   // Check if we can map phi to a constant.
528   if (FirstC) {
529     SimplifiedValues[&I] = FirstC;
530     return true;
531   }
532 
533   // Check if we can map phi to a pointer with constant offset.
534   if (FirstBaseAndOffset.first) {
535     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
536 
537     Value *SROAArg;
538     DenseMap<Value *, int>::iterator CostIt;
539     if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
540       SROAArgValues[&I] = SROAArg;
541   }
542 
543   return true;
544 }
545 
546 /// Check we can fold GEPs of constant-offset call site argument pointers.
547 /// This requires target data and inbounds GEPs.
548 ///
549 /// \return true if the specified GEP can be folded.
canFoldInboundsGEP(GetElementPtrInst & I)550 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
551   // Check if we have a base + offset for the pointer.
552   std::pair<Value *, APInt> BaseAndOffset =
553       ConstantOffsetPtrs.lookup(I.getPointerOperand());
554   if (!BaseAndOffset.first)
555     return false;
556 
557   // Check if the offset of this GEP is constant, and if so accumulate it
558   // into Offset.
559   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
560     return false;
561 
562   // Add the result as a new mapping to Base + Offset.
563   ConstantOffsetPtrs[&I] = BaseAndOffset;
564 
565   return true;
566 }
567 
visitGetElementPtr(GetElementPtrInst & I)568 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
569   Value *SROAArg;
570   DenseMap<Value *, int>::iterator CostIt;
571   bool SROACandidate =
572       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
573 
574   // Lambda to check whether a GEP's indices are all constant.
575   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
576     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
577       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
578         return false;
579     return true;
580   };
581 
582   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
583     if (SROACandidate)
584       SROAArgValues[&I] = SROAArg;
585 
586     // Constant GEPs are modeled as free.
587     return true;
588   }
589 
590   // Variable GEPs will require math and will disable SROA.
591   if (SROACandidate)
592     disableSROA(CostIt);
593   return isGEPFree(I);
594 }
595 
596 /// Simplify \p I if its operands are constants and update SimplifiedValues.
597 /// \p Evaluate is a callable specific to instruction type that evaluates the
598 /// instruction when all the operands are constants.
599 template <typename Callable>
simplifyInstruction(Instruction & I,Callable Evaluate)600 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
601   SmallVector<Constant *, 2> COps;
602   for (Value *Op : I.operands()) {
603     Constant *COp = dyn_cast<Constant>(Op);
604     if (!COp)
605       COp = SimplifiedValues.lookup(Op);
606     if (!COp)
607       return false;
608     COps.push_back(COp);
609   }
610   auto *C = Evaluate(COps);
611   if (!C)
612     return false;
613   SimplifiedValues[&I] = C;
614   return true;
615 }
616 
visitBitCast(BitCastInst & I)617 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
618   // Propagate constants through bitcasts.
619   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
620         return ConstantExpr::getBitCast(COps[0], I.getType());
621       }))
622     return true;
623 
624   // Track base/offsets through casts
625   std::pair<Value *, APInt> BaseAndOffset =
626       ConstantOffsetPtrs.lookup(I.getOperand(0));
627   // Casts don't change the offset, just wrap it up.
628   if (BaseAndOffset.first)
629     ConstantOffsetPtrs[&I] = BaseAndOffset;
630 
631   // Also look for SROA candidates here.
632   Value *SROAArg;
633   DenseMap<Value *, int>::iterator CostIt;
634   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
635     SROAArgValues[&I] = SROAArg;
636 
637   // Bitcasts are always zero cost.
638   return true;
639 }
640 
visitPtrToInt(PtrToIntInst & I)641 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
642   // Propagate constants through ptrtoint.
643   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
644         return ConstantExpr::getPtrToInt(COps[0], I.getType());
645       }))
646     return true;
647 
648   // Track base/offset pairs when converted to a plain integer provided the
649   // integer is large enough to represent the pointer.
650   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
651   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
652   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
653     std::pair<Value *, APInt> BaseAndOffset =
654         ConstantOffsetPtrs.lookup(I.getOperand(0));
655     if (BaseAndOffset.first)
656       ConstantOffsetPtrs[&I] = BaseAndOffset;
657   }
658 
659   // This is really weird. Technically, ptrtoint will disable SROA. However,
660   // unless that ptrtoint is *used* somewhere in the live basic blocks after
661   // inlining, it will be nuked, and SROA should proceed. All of the uses which
662   // would block SROA would also block SROA if applied directly to a pointer,
663   // and so we can just add the integer in here. The only places where SROA is
664   // preserved either cannot fire on an integer, or won't in-and-of themselves
665   // disable SROA (ext) w/o some later use that we would see and disable.
666   Value *SROAArg;
667   DenseMap<Value *, int>::iterator CostIt;
668   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
669     SROAArgValues[&I] = SROAArg;
670 
671   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
672 }
673 
visitIntToPtr(IntToPtrInst & I)674 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
675   // Propagate constants through ptrtoint.
676   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
677         return ConstantExpr::getIntToPtr(COps[0], I.getType());
678       }))
679     return true;
680 
681   // Track base/offset pairs when round-tripped through a pointer without
682   // modifications provided the integer is not too large.
683   Value *Op = I.getOperand(0);
684   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
685   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
686     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
687     if (BaseAndOffset.first)
688       ConstantOffsetPtrs[&I] = BaseAndOffset;
689   }
690 
691   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
692   Value *SROAArg;
693   DenseMap<Value *, int>::iterator CostIt;
694   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
695     SROAArgValues[&I] = SROAArg;
696 
697   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
698 }
699 
visitCastInst(CastInst & I)700 bool CallAnalyzer::visitCastInst(CastInst &I) {
701   // Propagate constants through ptrtoint.
702   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
703         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
704       }))
705     return true;
706 
707   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
708   disableSROA(I.getOperand(0));
709 
710   // If this is a floating-point cast, and the target says this operation
711   // is expensive, this may eventually become a library call. Treat the cost
712   // as such.
713   switch (I.getOpcode()) {
714   case Instruction::FPTrunc:
715   case Instruction::FPExt:
716   case Instruction::UIToFP:
717   case Instruction::SIToFP:
718   case Instruction::FPToUI:
719   case Instruction::FPToSI:
720     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
721       Cost += InlineConstants::CallPenalty;
722     break;
723   default:
724     break;
725   }
726 
727   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
728 }
729 
visitUnaryInstruction(UnaryInstruction & I)730 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
731   Value *Operand = I.getOperand(0);
732   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
733         return ConstantFoldInstOperands(&I, COps[0], DL);
734       }))
735     return true;
736 
737   // Disable any SROA on the argument to arbitrary unary operators.
738   disableSROA(Operand);
739 
740   return false;
741 }
742 
paramHasAttr(Argument * A,Attribute::AttrKind Attr)743 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
744   return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
745 }
746 
isKnownNonNullInCallee(Value * V)747 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
748   // Does the *call site* have the NonNull attribute set on an argument?  We
749   // use the attribute on the call site to memoize any analysis done in the
750   // caller. This will also trip if the callee function has a non-null
751   // parameter attribute, but that's a less interesting case because hopefully
752   // the callee would already have been simplified based on that.
753   if (Argument *A = dyn_cast<Argument>(V))
754     if (paramHasAttr(A, Attribute::NonNull))
755       return true;
756 
757   // Is this an alloca in the caller?  This is distinct from the attribute case
758   // above because attributes aren't updated within the inliner itself and we
759   // always want to catch the alloca derived case.
760   if (isAllocaDerivedArg(V))
761     // We can actually predict the result of comparisons between an
762     // alloca-derived value and null. Note that this fires regardless of
763     // SROA firing.
764     return true;
765 
766   return false;
767 }
768 
allowSizeGrowth(CallSite CS)769 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
770   // If the normal destination of the invoke or the parent block of the call
771   // site is unreachable-terminated, there is little point in inlining this
772   // unless there is literally zero cost.
773   // FIXME: Note that it is possible that an unreachable-terminated block has a
774   // hot entry. For example, in below scenario inlining hot_call_X() may be
775   // beneficial :
776   // main() {
777   //   hot_call_1();
778   //   ...
779   //   hot_call_N()
780   //   exit(0);
781   // }
782   // For now, we are not handling this corner case here as it is rare in real
783   // code. In future, we should elaborate this based on BPI and BFI in more
784   // general threshold adjusting heuristics in updateThreshold().
785   Instruction *Instr = CS.getInstruction();
786   if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
787     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
788       return false;
789   } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
790     return false;
791 
792   return true;
793 }
794 
isColdCallSite(CallSite CS,BlockFrequencyInfo * CallerBFI)795 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
796   // If global profile summary is available, then callsite's coldness is
797   // determined based on that.
798   if (PSI && PSI->hasProfileSummary())
799     return PSI->isColdCallSite(CS, CallerBFI);
800 
801   // Otherwise we need BFI to be available.
802   if (!CallerBFI)
803     return false;
804 
805   // Determine if the callsite is cold relative to caller's entry. We could
806   // potentially cache the computation of scaled entry frequency, but the added
807   // complexity is not worth it unless this scaling shows up high in the
808   // profiles.
809   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
810   auto CallSiteBB = CS.getInstruction()->getParent();
811   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
812   auto CallerEntryFreq =
813       CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
814   return CallSiteFreq < CallerEntryFreq * ColdProb;
815 }
816 
817 Optional<int>
getHotCallSiteThreshold(CallSite CS,BlockFrequencyInfo * CallerBFI)818 CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
819                                       BlockFrequencyInfo *CallerBFI) {
820 
821   // If global profile summary is available, then callsite's hotness is
822   // determined based on that.
823   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
824     return Params.HotCallSiteThreshold;
825 
826   // Otherwise we need BFI to be available and to have a locally hot callsite
827   // threshold.
828   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
829     return None;
830 
831   // Determine if the callsite is hot relative to caller's entry. We could
832   // potentially cache the computation of scaled entry frequency, but the added
833   // complexity is not worth it unless this scaling shows up high in the
834   // profiles.
835   auto CallSiteBB = CS.getInstruction()->getParent();
836   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
837   auto CallerEntryFreq = CallerBFI->getEntryFreq();
838   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
839     return Params.LocallyHotCallSiteThreshold;
840 
841   // Otherwise treat it normally.
842   return None;
843 }
844 
updateThreshold(CallSite CS,Function & Callee)845 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
846   // If no size growth is allowed for this inlining, set Threshold to 0.
847   if (!allowSizeGrowth(CS)) {
848     Threshold = 0;
849     return;
850   }
851 
852   Function *Caller = CS.getCaller();
853 
854   // return min(A, B) if B is valid.
855   auto MinIfValid = [](int A, Optional<int> B) {
856     return B ? std::min(A, B.getValue()) : A;
857   };
858 
859   // return max(A, B) if B is valid.
860   auto MaxIfValid = [](int A, Optional<int> B) {
861     return B ? std::max(A, B.getValue()) : A;
862   };
863 
864   // Various bonus percentages. These are multiplied by Threshold to get the
865   // bonus values.
866   // SingleBBBonus: This bonus is applied if the callee has a single reachable
867   // basic block at the given callsite context. This is speculatively applied
868   // and withdrawn if more than one basic block is seen.
869   //
870   // Vector bonuses: We want to more aggressively inline vector-dense kernels
871   // and apply this bonus based on the percentage of vector instructions. A
872   // bonus is applied if the vector instructions exceed 50% and half that amount
873   // is applied if it exceeds 10%. Note that these bonuses are some what
874   // arbitrary and evolved over time by accident as much as because they are
875   // principled bonuses.
876   // FIXME: It would be nice to base the bonus values on something more
877   // scientific.
878   //
879   // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
880   // of the last call to a static function as inlining such functions is
881   // guaranteed to reduce code size.
882   //
883   // These bonus percentages may be set to 0 based on properties of the caller
884   // and the callsite.
885   int SingleBBBonusPercent = 50;
886   int VectorBonusPercent = 150;
887   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
888 
889   // Lambda to set all the above bonus and bonus percentages to 0.
890   auto DisallowAllBonuses = [&]() {
891     SingleBBBonusPercent = 0;
892     VectorBonusPercent = 0;
893     LastCallToStaticBonus = 0;
894   };
895 
896   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
897   // and reduce the threshold if the caller has the necessary attribute.
898   if (Caller->optForMinSize()) {
899     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
900     // For minsize, we want to disable the single BB bonus and the vector
901     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
902     // a static function will, at the minimum, eliminate the parameter setup and
903     // call/return instructions.
904     SingleBBBonusPercent = 0;
905     VectorBonusPercent = 0;
906   } else if (Caller->optForSize())
907     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
908 
909   // Adjust the threshold based on inlinehint attribute and profile based
910   // hotness information if the caller does not have MinSize attribute.
911   if (!Caller->optForMinSize()) {
912     if (Callee.hasFnAttribute(Attribute::InlineHint))
913       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
914 
915     // FIXME: After switching to the new passmanager, simplify the logic below
916     // by checking only the callsite hotness/coldness as we will reliably
917     // have local profile information.
918     //
919     // Callsite hotness and coldness can be determined if sample profile is
920     // used (which adds hotness metadata to calls) or if caller's
921     // BlockFrequencyInfo is available.
922     BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
923     auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
924     if (!Caller->optForSize() && HotCallSiteThreshold) {
925       LLVM_DEBUG(dbgs() << "Hot callsite.\n");
926       // FIXME: This should update the threshold only if it exceeds the
927       // current threshold, but AutoFDO + ThinLTO currently relies on this
928       // behavior to prevent inlining of hot callsites during ThinLTO
929       // compile phase.
930       Threshold = HotCallSiteThreshold.getValue();
931     } else if (isColdCallSite(CS, CallerBFI)) {
932       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
933       // Do not apply bonuses for a cold callsite including the
934       // LastCallToStatic bonus. While this bonus might result in code size
935       // reduction, it can cause the size of a non-cold caller to increase
936       // preventing it from being inlined.
937       DisallowAllBonuses();
938       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
939     } else if (PSI) {
940       // Use callee's global profile information only if we have no way of
941       // determining this via callsite information.
942       if (PSI->isFunctionEntryHot(&Callee)) {
943         LLVM_DEBUG(dbgs() << "Hot callee.\n");
944         // If callsite hotness can not be determined, we may still know
945         // that the callee is hot and treat it as a weaker hint for threshold
946         // increase.
947         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
948       } else if (PSI->isFunctionEntryCold(&Callee)) {
949         LLVM_DEBUG(dbgs() << "Cold callee.\n");
950         // Do not apply bonuses for a cold callee including the
951         // LastCallToStatic bonus. While this bonus might result in code size
952         // reduction, it can cause the size of a non-cold caller to increase
953         // preventing it from being inlined.
954         DisallowAllBonuses();
955         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
956       }
957     }
958   }
959 
960   // Finally, take the target-specific inlining threshold multiplier into
961   // account.
962   Threshold *= TTI.getInliningThresholdMultiplier();
963 
964   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
965   VectorBonus = Threshold * VectorBonusPercent / 100;
966 
967   bool OnlyOneCallAndLocalLinkage =
968       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
969   // If there is only one call of the function, and it has internal linkage,
970   // the cost of inlining it drops dramatically. It may seem odd to update
971   // Cost in updateThreshold, but the bonus depends on the logic in this method.
972   if (OnlyOneCallAndLocalLinkage)
973     Cost -= LastCallToStaticBonus;
974 }
975 
visitCmpInst(CmpInst & I)976 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
977   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
978   // First try to handle simplified comparisons.
979   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
980         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
981       }))
982     return true;
983 
984   if (I.getOpcode() == Instruction::FCmp)
985     return false;
986 
987   // Otherwise look for a comparison between constant offset pointers with
988   // a common base.
989   Value *LHSBase, *RHSBase;
990   APInt LHSOffset, RHSOffset;
991   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
992   if (LHSBase) {
993     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
994     if (RHSBase && LHSBase == RHSBase) {
995       // We have common bases, fold the icmp to a constant based on the
996       // offsets.
997       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
998       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
999       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1000         SimplifiedValues[&I] = C;
1001         ++NumConstantPtrCmps;
1002         return true;
1003       }
1004     }
1005   }
1006 
1007   // If the comparison is an equality comparison with null, we can simplify it
1008   // if we know the value (argument) can't be null
1009   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1010       isKnownNonNullInCallee(I.getOperand(0))) {
1011     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1012     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1013                                       : ConstantInt::getFalse(I.getType());
1014     return true;
1015   }
1016   // Finally check for SROA candidates in comparisons.
1017   Value *SROAArg;
1018   DenseMap<Value *, int>::iterator CostIt;
1019   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1020     if (isa<ConstantPointerNull>(I.getOperand(1))) {
1021       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1022       return true;
1023     }
1024 
1025     disableSROA(CostIt);
1026   }
1027 
1028   return false;
1029 }
1030 
visitSub(BinaryOperator & I)1031 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1032   // Try to handle a special case: we can fold computing the difference of two
1033   // constant-related pointers.
1034   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1035   Value *LHSBase, *RHSBase;
1036   APInt LHSOffset, RHSOffset;
1037   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1038   if (LHSBase) {
1039     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1040     if (RHSBase && LHSBase == RHSBase) {
1041       // We have common bases, fold the subtract to a constant based on the
1042       // offsets.
1043       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1044       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1045       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1046         SimplifiedValues[&I] = C;
1047         ++NumConstantPtrDiffs;
1048         return true;
1049       }
1050     }
1051   }
1052 
1053   // Otherwise, fall back to the generic logic for simplifying and handling
1054   // instructions.
1055   return Base::visitSub(I);
1056 }
1057 
visitBinaryOperator(BinaryOperator & I)1058 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1059   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1060   Constant *CLHS = dyn_cast<Constant>(LHS);
1061   if (!CLHS)
1062     CLHS = SimplifiedValues.lookup(LHS);
1063   Constant *CRHS = dyn_cast<Constant>(RHS);
1064   if (!CRHS)
1065     CRHS = SimplifiedValues.lookup(RHS);
1066 
1067   Value *SimpleV = nullptr;
1068   if (auto FI = dyn_cast<FPMathOperator>(&I))
1069     SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1070                               CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1071   else
1072     SimpleV =
1073         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1074 
1075   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1076     SimplifiedValues[&I] = C;
1077 
1078   if (SimpleV)
1079     return true;
1080 
1081   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1082   disableSROA(LHS);
1083   disableSROA(RHS);
1084 
1085   // If the instruction is floating point, and the target says this operation
1086   // is expensive, this may eventually become a library call. Treat the cost
1087   // as such.
1088   if (I.getType()->isFloatingPointTy() &&
1089       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1090     Cost += InlineConstants::CallPenalty;
1091 
1092   return false;
1093 }
1094 
visitLoad(LoadInst & I)1095 bool CallAnalyzer::visitLoad(LoadInst &I) {
1096   Value *SROAArg;
1097   DenseMap<Value *, int>::iterator CostIt;
1098   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1099     if (I.isSimple()) {
1100       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1101       return true;
1102     }
1103 
1104     disableSROA(CostIt);
1105   }
1106 
1107   // If the data is already loaded from this address and hasn't been clobbered
1108   // by any stores or calls, this load is likely to be redundant and can be
1109   // eliminated.
1110   if (EnableLoadElimination &&
1111       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1112     LoadEliminationCost += InlineConstants::InstrCost;
1113     return true;
1114   }
1115 
1116   return false;
1117 }
1118 
visitStore(StoreInst & I)1119 bool CallAnalyzer::visitStore(StoreInst &I) {
1120   Value *SROAArg;
1121   DenseMap<Value *, int>::iterator CostIt;
1122   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1123     if (I.isSimple()) {
1124       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1125       return true;
1126     }
1127 
1128     disableSROA(CostIt);
1129   }
1130 
1131   // The store can potentially clobber loads and prevent repeated loads from
1132   // being eliminated.
1133   // FIXME:
1134   // 1. We can probably keep an initial set of eliminatable loads substracted
1135   // from the cost even when we finally see a store. We just need to disable
1136   // *further* accumulation of elimination savings.
1137   // 2. We should probably at some point thread MemorySSA for the callee into
1138   // this and then use that to actually compute *really* precise savings.
1139   disableLoadElimination();
1140   return false;
1141 }
1142 
visitExtractValue(ExtractValueInst & I)1143 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1144   // Constant folding for extract value is trivial.
1145   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1146         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1147       }))
1148     return true;
1149 
1150   // SROA can look through these but give them a cost.
1151   return false;
1152 }
1153 
visitInsertValue(InsertValueInst & I)1154 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1155   // Constant folding for insert value is trivial.
1156   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1157         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1158                                             /*InsertedValueOperand*/ COps[1],
1159                                             I.getIndices());
1160       }))
1161     return true;
1162 
1163   // SROA can look through these but give them a cost.
1164   return false;
1165 }
1166 
1167 /// Try to simplify a call site.
1168 ///
1169 /// Takes a concrete function and callsite and tries to actually simplify it by
1170 /// analyzing the arguments and call itself with instsimplify. Returns true if
1171 /// it has simplified the callsite to some other entity (a constant), making it
1172 /// free.
simplifyCallSite(Function * F,CallSite CS)1173 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1174   // FIXME: Using the instsimplify logic directly for this is inefficient
1175   // because we have to continually rebuild the argument list even when no
1176   // simplifications can be performed. Until that is fixed with remapping
1177   // inside of instsimplify, directly constant fold calls here.
1178   if (!canConstantFoldCallTo(CS, F))
1179     return false;
1180 
1181   // Try to re-map the arguments to constants.
1182   SmallVector<Constant *, 4> ConstantArgs;
1183   ConstantArgs.reserve(CS.arg_size());
1184   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1185        ++I) {
1186     Constant *C = dyn_cast<Constant>(*I);
1187     if (!C)
1188       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1189     if (!C)
1190       return false; // This argument doesn't map to a constant.
1191 
1192     ConstantArgs.push_back(C);
1193   }
1194   if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1195     SimplifiedValues[CS.getInstruction()] = C;
1196     return true;
1197   }
1198 
1199   return false;
1200 }
1201 
visitCallSite(CallSite CS)1202 bool CallAnalyzer::visitCallSite(CallSite CS) {
1203   if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
1204       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1205     // This aborts the entire analysis.
1206     ExposesReturnsTwice = true;
1207     return false;
1208   }
1209   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
1210     ContainsNoDuplicateCall = true;
1211 
1212   if (Function *F = CS.getCalledFunction()) {
1213     // When we have a concrete function, first try to simplify it directly.
1214     if (simplifyCallSite(F, CS))
1215       return true;
1216 
1217     // Next check if it is an intrinsic we know about.
1218     // FIXME: Lift this into part of the InstVisitor.
1219     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
1220       switch (II->getIntrinsicID()) {
1221       default:
1222         if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1223           disableLoadElimination();
1224         return Base::visitCallSite(CS);
1225 
1226       case Intrinsic::load_relative:
1227         // This is normally lowered to 4 LLVM instructions.
1228         Cost += 3 * InlineConstants::InstrCost;
1229         return false;
1230 
1231       case Intrinsic::memset:
1232       case Intrinsic::memcpy:
1233       case Intrinsic::memmove:
1234         disableLoadElimination();
1235         // SROA can usually chew through these intrinsics, but they aren't free.
1236         return false;
1237       case Intrinsic::icall_branch_funnel:
1238       case Intrinsic::localescape:
1239         HasUninlineableIntrinsic = true;
1240         return false;
1241       case Intrinsic::vastart:
1242       case Intrinsic::vaend:
1243         UsesVarArgs = true;
1244         return false;
1245       }
1246     }
1247 
1248     if (F == CS.getInstruction()->getFunction()) {
1249       // This flag will fully abort the analysis, so don't bother with anything
1250       // else.
1251       IsRecursiveCall = true;
1252       return false;
1253     }
1254 
1255     if (TTI.isLoweredToCall(F)) {
1256       // We account for the average 1 instruction per call argument setup
1257       // here.
1258       Cost += CS.arg_size() * InlineConstants::InstrCost;
1259 
1260       // Everything other than inline ASM will also have a significant cost
1261       // merely from making the call.
1262       if (!isa<InlineAsm>(CS.getCalledValue()))
1263         Cost += InlineConstants::CallPenalty;
1264     }
1265 
1266     if (!CS.onlyReadsMemory())
1267       disableLoadElimination();
1268     return Base::visitCallSite(CS);
1269   }
1270 
1271   // Otherwise we're in a very special case -- an indirect function call. See
1272   // if we can be particularly clever about this.
1273   Value *Callee = CS.getCalledValue();
1274 
1275   // First, pay the price of the argument setup. We account for the average
1276   // 1 instruction per call argument setup here.
1277   Cost += CS.arg_size() * InlineConstants::InstrCost;
1278 
1279   // Next, check if this happens to be an indirect function call to a known
1280   // function in this inline context. If not, we've done all we can.
1281   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1282   if (!F) {
1283     if (!CS.onlyReadsMemory())
1284       disableLoadElimination();
1285     return Base::visitCallSite(CS);
1286   }
1287 
1288   // If we have a constant that we are calling as a function, we can peer
1289   // through it and see the function target. This happens not infrequently
1290   // during devirtualization and so we want to give it a hefty bonus for
1291   // inlining, but cap that bonus in the event that inlining wouldn't pan
1292   // out. Pretend to inline the function, with a custom threshold.
1293   auto IndirectCallParams = Params;
1294   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1295   CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
1296                   IndirectCallParams);
1297   if (CA.analyzeCall(CS)) {
1298     // We were able to inline the indirect call! Subtract the cost from the
1299     // threshold to get the bonus we want to apply, but don't go below zero.
1300     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1301   }
1302 
1303   if (!F->onlyReadsMemory())
1304     disableLoadElimination();
1305   return Base::visitCallSite(CS);
1306 }
1307 
visitReturnInst(ReturnInst & RI)1308 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1309   // At least one return instruction will be free after inlining.
1310   bool Free = !HasReturn;
1311   HasReturn = true;
1312   return Free;
1313 }
1314 
visitBranchInst(BranchInst & BI)1315 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1316   // We model unconditional branches as essentially free -- they really
1317   // shouldn't exist at all, but handling them makes the behavior of the
1318   // inliner more regular and predictable. Interestingly, conditional branches
1319   // which will fold away are also free.
1320   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1321          dyn_cast_or_null<ConstantInt>(
1322              SimplifiedValues.lookup(BI.getCondition()));
1323 }
1324 
visitSelectInst(SelectInst & SI)1325 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1326   bool CheckSROA = SI.getType()->isPointerTy();
1327   Value *TrueVal = SI.getTrueValue();
1328   Value *FalseVal = SI.getFalseValue();
1329 
1330   Constant *TrueC = dyn_cast<Constant>(TrueVal);
1331   if (!TrueC)
1332     TrueC = SimplifiedValues.lookup(TrueVal);
1333   Constant *FalseC = dyn_cast<Constant>(FalseVal);
1334   if (!FalseC)
1335     FalseC = SimplifiedValues.lookup(FalseVal);
1336   Constant *CondC =
1337       dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1338 
1339   if (!CondC) {
1340     // Select C, X, X => X
1341     if (TrueC == FalseC && TrueC) {
1342       SimplifiedValues[&SI] = TrueC;
1343       return true;
1344     }
1345 
1346     if (!CheckSROA)
1347       return Base::visitSelectInst(SI);
1348 
1349     std::pair<Value *, APInt> TrueBaseAndOffset =
1350         ConstantOffsetPtrs.lookup(TrueVal);
1351     std::pair<Value *, APInt> FalseBaseAndOffset =
1352         ConstantOffsetPtrs.lookup(FalseVal);
1353     if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1354       ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1355 
1356       Value *SROAArg;
1357       DenseMap<Value *, int>::iterator CostIt;
1358       if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1359         SROAArgValues[&SI] = SROAArg;
1360       return true;
1361     }
1362 
1363     return Base::visitSelectInst(SI);
1364   }
1365 
1366   // Select condition is a constant.
1367   Value *SelectedV = CondC->isAllOnesValue()
1368                          ? TrueVal
1369                          : (CondC->isNullValue()) ? FalseVal : nullptr;
1370   if (!SelectedV) {
1371     // Condition is a vector constant that is not all 1s or all 0s.  If all
1372     // operands are constants, ConstantExpr::getSelect() can handle the cases
1373     // such as select vectors.
1374     if (TrueC && FalseC) {
1375       if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1376         SimplifiedValues[&SI] = C;
1377         return true;
1378       }
1379     }
1380     return Base::visitSelectInst(SI);
1381   }
1382 
1383   // Condition is either all 1s or all 0s. SI can be simplified.
1384   if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1385     SimplifiedValues[&SI] = SelectedC;
1386     return true;
1387   }
1388 
1389   if (!CheckSROA)
1390     return true;
1391 
1392   std::pair<Value *, APInt> BaseAndOffset =
1393       ConstantOffsetPtrs.lookup(SelectedV);
1394   if (BaseAndOffset.first) {
1395     ConstantOffsetPtrs[&SI] = BaseAndOffset;
1396 
1397     Value *SROAArg;
1398     DenseMap<Value *, int>::iterator CostIt;
1399     if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1400       SROAArgValues[&SI] = SROAArg;
1401   }
1402 
1403   return true;
1404 }
1405 
visitSwitchInst(SwitchInst & SI)1406 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1407   // We model unconditional switches as free, see the comments on handling
1408   // branches.
1409   if (isa<ConstantInt>(SI.getCondition()))
1410     return true;
1411   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1412     if (isa<ConstantInt>(V))
1413       return true;
1414 
1415   // Assume the most general case where the switch is lowered into
1416   // either a jump table, bit test, or a balanced binary tree consisting of
1417   // case clusters without merging adjacent clusters with the same
1418   // destination. We do not consider the switches that are lowered with a mix
1419   // of jump table/bit test/binary search tree. The cost of the switch is
1420   // proportional to the size of the tree or the size of jump table range.
1421   //
1422   // NB: We convert large switches which are just used to initialize large phi
1423   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1424   // inlining those. It will prevent inlining in cases where the optimization
1425   // does not (yet) fire.
1426 
1427   // Maximum valid cost increased in this function.
1428   int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1429 
1430   // Exit early for a large switch, assuming one case needs at least one
1431   // instruction.
1432   // FIXME: This is not true for a bit test, but ignore such case for now to
1433   // save compile-time.
1434   int64_t CostLowerBound =
1435       std::min((int64_t)CostUpperBound,
1436                (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1437 
1438   if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1439     Cost = CostLowerBound;
1440     return false;
1441   }
1442 
1443   unsigned JumpTableSize = 0;
1444   unsigned NumCaseCluster =
1445       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1446 
1447   // If suitable for a jump table, consider the cost for the table size and
1448   // branch to destination.
1449   if (JumpTableSize) {
1450     int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1451                      4 * InlineConstants::InstrCost;
1452 
1453     Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1454     return false;
1455   }
1456 
1457   // Considering forming a binary search, we should find the number of nodes
1458   // which is same as the number of comparisons when lowered. For a given
1459   // number of clusters, n, we can define a recursive function, f(n), to find
1460   // the number of nodes in the tree. The recursion is :
1461   // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
1462   // and f(n) = n, when n <= 3.
1463   // This will lead a binary tree where the leaf should be either f(2) or f(3)
1464   // when n > 3.  So, the number of comparisons from leaves should be n, while
1465   // the number of non-leaf should be :
1466   //   2^(log2(n) - 1) - 1
1467   //   = 2^log2(n) * 2^-1 - 1
1468   //   = n / 2 - 1.
1469   // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1470   // number of comparisons in a simple closed form :
1471   //   n + n / 2 - 1 = n * 3 / 2 - 1
1472   if (NumCaseCluster <= 3) {
1473     // Suppose a comparison includes one compare and one conditional branch.
1474     Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1475     return false;
1476   }
1477 
1478   int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1479   int64_t SwitchCost =
1480       ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1481 
1482   Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1483   return false;
1484 }
1485 
visitIndirectBrInst(IndirectBrInst & IBI)1486 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1487   // We never want to inline functions that contain an indirectbr.  This is
1488   // incorrect because all the blockaddress's (in static global initializers
1489   // for example) would be referring to the original function, and this
1490   // indirect jump would jump from the inlined copy of the function into the
1491   // original function which is extremely undefined behavior.
1492   // FIXME: This logic isn't really right; we can safely inline functions with
1493   // indirectbr's as long as no other function or global references the
1494   // blockaddress of a block within the current function.
1495   HasIndirectBr = true;
1496   return false;
1497 }
1498 
visitResumeInst(ResumeInst & RI)1499 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1500   // FIXME: It's not clear that a single instruction is an accurate model for
1501   // the inline cost of a resume instruction.
1502   return false;
1503 }
1504 
visitCleanupReturnInst(CleanupReturnInst & CRI)1505 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1506   // FIXME: It's not clear that a single instruction is an accurate model for
1507   // the inline cost of a cleanupret instruction.
1508   return false;
1509 }
1510 
visitCatchReturnInst(CatchReturnInst & CRI)1511 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1512   // FIXME: It's not clear that a single instruction is an accurate model for
1513   // the inline cost of a catchret instruction.
1514   return false;
1515 }
1516 
visitUnreachableInst(UnreachableInst & I)1517 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1518   // FIXME: It might be reasonably to discount the cost of instructions leading
1519   // to unreachable as they have the lowest possible impact on both runtime and
1520   // code size.
1521   return true; // No actual code is needed for unreachable.
1522 }
1523 
visitInstruction(Instruction & I)1524 bool CallAnalyzer::visitInstruction(Instruction &I) {
1525   // Some instructions are free. All of the free intrinsics can also be
1526   // handled by SROA, etc.
1527   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1528     return true;
1529 
1530   // We found something we don't understand or can't handle. Mark any SROA-able
1531   // values in the operand list as no longer viable.
1532   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1533     disableSROA(*OI);
1534 
1535   return false;
1536 }
1537 
1538 /// Analyze a basic block for its contribution to the inline cost.
1539 ///
1540 /// This method walks the analyzer over every instruction in the given basic
1541 /// block and accounts for their cost during inlining at this callsite. It
1542 /// aborts early if the threshold has been exceeded or an impossible to inline
1543 /// construct has been detected. It returns false if inlining is no longer
1544 /// viable, and true if inlining remains viable.
analyzeBlock(BasicBlock * BB,SmallPtrSetImpl<const Value * > & EphValues)1545 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1546                                 SmallPtrSetImpl<const Value *> &EphValues) {
1547   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1548     // FIXME: Currently, the number of instructions in a function regardless of
1549     // our ability to simplify them during inline to constants or dead code,
1550     // are actually used by the vector bonus heuristic. As long as that's true,
1551     // we have to special case debug intrinsics here to prevent differences in
1552     // inlining due to debug symbols. Eventually, the number of unsimplified
1553     // instructions shouldn't factor into the cost computation, but until then,
1554     // hack around it here.
1555     if (isa<DbgInfoIntrinsic>(I))
1556       continue;
1557 
1558     // Skip ephemeral values.
1559     if (EphValues.count(&*I))
1560       continue;
1561 
1562     ++NumInstructions;
1563     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1564       ++NumVectorInstructions;
1565 
1566     // If the instruction simplified to a constant, there is no cost to this
1567     // instruction. Visit the instructions using our InstVisitor to account for
1568     // all of the per-instruction logic. The visit tree returns true if we
1569     // consumed the instruction in any way, and false if the instruction's base
1570     // cost should count against inlining.
1571     if (Base::visit(&*I))
1572       ++NumInstructionsSimplified;
1573     else
1574       Cost += InlineConstants::InstrCost;
1575 
1576     using namespace ore;
1577     // If the visit this instruction detected an uninlinable pattern, abort.
1578     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1579         HasIndirectBr || HasUninlineableIntrinsic || UsesVarArgs) {
1580       if (ORE)
1581         ORE->emit([&]() {
1582           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1583                                           CandidateCS.getInstruction())
1584                  << NV("Callee", &F)
1585                  << " has uninlinable pattern and cost is not fully computed";
1586         });
1587       return false;
1588     }
1589 
1590     // If the caller is a recursive function then we don't want to inline
1591     // functions which allocate a lot of stack space because it would increase
1592     // the caller stack usage dramatically.
1593     if (IsCallerRecursive &&
1594         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1595       if (ORE)
1596         ORE->emit([&]() {
1597           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1598                                           CandidateCS.getInstruction())
1599                  << NV("Callee", &F)
1600                  << " is recursive and allocates too much stack space. Cost is "
1601                     "not fully computed";
1602         });
1603       return false;
1604     }
1605 
1606     // Check if we've past the maximum possible threshold so we don't spin in
1607     // huge basic blocks that will never inline.
1608     if (Cost >= Threshold && !ComputeFullInlineCost)
1609       return false;
1610   }
1611 
1612   return true;
1613 }
1614 
1615 /// Compute the base pointer and cumulative constant offsets for V.
1616 ///
1617 /// This strips all constant offsets off of V, leaving it the base pointer, and
1618 /// accumulates the total constant offset applied in the returned constant. It
1619 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1620 /// no constant offsets applied.
stripAndComputeInBoundsConstantOffsets(Value * & V)1621 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1622   if (!V->getType()->isPointerTy())
1623     return nullptr;
1624 
1625   unsigned AS = V->getType()->getPointerAddressSpace();
1626   unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1627   APInt Offset = APInt::getNullValue(IntPtrWidth);
1628 
1629   // Even though we don't look through PHI nodes, we could be called on an
1630   // instruction in an unreachable block, which may be on a cycle.
1631   SmallPtrSet<Value *, 4> Visited;
1632   Visited.insert(V);
1633   do {
1634     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1635       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1636         return nullptr;
1637       V = GEP->getPointerOperand();
1638     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1639       V = cast<Operator>(V)->getOperand(0);
1640     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1641       if (GA->isInterposable())
1642         break;
1643       V = GA->getAliasee();
1644     } else {
1645       break;
1646     }
1647     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1648   } while (Visited.insert(V).second);
1649 
1650   Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1651   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1652 }
1653 
1654 /// Find dead blocks due to deleted CFG edges during inlining.
1655 ///
1656 /// If we know the successor of the current block, \p CurrBB, has to be \p
1657 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1658 /// no live incoming CFG edges.  If one block is found to be dead, we can
1659 /// continue growing the dead block list by checking the successors of the dead
1660 /// blocks to see if all their incoming edges are dead or not.
findDeadBlocks(BasicBlock * CurrBB,BasicBlock * NextBB)1661 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1662   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1663     // A CFG edge is dead if the predecessor is dead or the predessor has a
1664     // known successor which is not the one under exam.
1665     return (DeadBlocks.count(Pred) ||
1666             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1667   };
1668 
1669   auto IsNewlyDead = [&](BasicBlock *BB) {
1670     // If all the edges to a block are dead, the block is also dead.
1671     return (!DeadBlocks.count(BB) &&
1672             llvm::all_of(predecessors(BB),
1673                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1674   };
1675 
1676   for (BasicBlock *Succ : successors(CurrBB)) {
1677     if (Succ == NextBB || !IsNewlyDead(Succ))
1678       continue;
1679     SmallVector<BasicBlock *, 4> NewDead;
1680     NewDead.push_back(Succ);
1681     while (!NewDead.empty()) {
1682       BasicBlock *Dead = NewDead.pop_back_val();
1683       if (DeadBlocks.insert(Dead))
1684         // Continue growing the dead block lists.
1685         for (BasicBlock *S : successors(Dead))
1686           if (IsNewlyDead(S))
1687             NewDead.push_back(S);
1688     }
1689   }
1690 }
1691 
1692 /// Analyze a call site for potential inlining.
1693 ///
1694 /// Returns true if inlining this call is viable, and false if it is not
1695 /// viable. It computes the cost and adjusts the threshold based on numerous
1696 /// factors and heuristics. If this method returns false but the computed cost
1697 /// is below the computed threshold, then inlining was forcibly disabled by
1698 /// some artifact of the routine.
analyzeCall(CallSite CS)1699 bool CallAnalyzer::analyzeCall(CallSite CS) {
1700   ++NumCallsAnalyzed;
1701 
1702   // Perform some tweaks to the cost and threshold based on the direct
1703   // callsite information.
1704 
1705   // We want to more aggressively inline vector-dense kernels, so up the
1706   // threshold, and we'll lower it if the % of vector instructions gets too
1707   // low. Note that these bonuses are some what arbitrary and evolved over time
1708   // by accident as much as because they are principled bonuses.
1709   //
1710   // FIXME: It would be nice to remove all such bonuses. At least it would be
1711   // nice to base the bonus values on something more scientific.
1712   assert(NumInstructions == 0);
1713   assert(NumVectorInstructions == 0);
1714 
1715   // Update the threshold based on callsite properties
1716   updateThreshold(CS, F);
1717 
1718   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1719   // this Threshold any time, and cost cannot decrease, we can stop processing
1720   // the rest of the function body.
1721   Threshold += (SingleBBBonus + VectorBonus);
1722 
1723   // Give out bonuses for the callsite, as the instructions setting them up
1724   // will be gone after inlining.
1725   Cost -= getCallsiteCost(CS, DL);
1726 
1727   // If this function uses the coldcc calling convention, prefer not to inline
1728   // it.
1729   if (F.getCallingConv() == CallingConv::Cold)
1730     Cost += InlineConstants::ColdccPenalty;
1731 
1732   // Check if we're done. This can happen due to bonuses and penalties.
1733   if (Cost >= Threshold && !ComputeFullInlineCost)
1734     return false;
1735 
1736   if (F.empty())
1737     return true;
1738 
1739   Function *Caller = CS.getInstruction()->getFunction();
1740   // Check if the caller function is recursive itself.
1741   for (User *U : Caller->users()) {
1742     CallSite Site(U);
1743     if (!Site)
1744       continue;
1745     Instruction *I = Site.getInstruction();
1746     if (I->getFunction() == Caller) {
1747       IsCallerRecursive = true;
1748       break;
1749     }
1750   }
1751 
1752   // Populate our simplified values by mapping from function arguments to call
1753   // arguments with known important simplifications.
1754   CallSite::arg_iterator CAI = CS.arg_begin();
1755   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1756        FAI != FAE; ++FAI, ++CAI) {
1757     assert(CAI != CS.arg_end());
1758     if (Constant *C = dyn_cast<Constant>(CAI))
1759       SimplifiedValues[&*FAI] = C;
1760 
1761     Value *PtrArg = *CAI;
1762     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1763       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1764 
1765       // We can SROA any pointer arguments derived from alloca instructions.
1766       if (isa<AllocaInst>(PtrArg)) {
1767         SROAArgValues[&*FAI] = PtrArg;
1768         SROAArgCosts[PtrArg] = 0;
1769       }
1770     }
1771   }
1772   NumConstantArgs = SimplifiedValues.size();
1773   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1774   NumAllocaArgs = SROAArgValues.size();
1775 
1776   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1777   // the ephemeral values multiple times (and they're completely determined by
1778   // the callee, so this is purely duplicate work).
1779   SmallPtrSet<const Value *, 32> EphValues;
1780   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1781 
1782   // The worklist of live basic blocks in the callee *after* inlining. We avoid
1783   // adding basic blocks of the callee which can be proven to be dead for this
1784   // particular call site in order to get more accurate cost estimates. This
1785   // requires a somewhat heavyweight iteration pattern: we need to walk the
1786   // basic blocks in a breadth-first order as we insert live successors. To
1787   // accomplish this, prioritizing for small iterations because we exit after
1788   // crossing our threshold, we use a small-size optimized SetVector.
1789   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1790                     SmallPtrSet<BasicBlock *, 16>>
1791       BBSetVector;
1792   BBSetVector BBWorklist;
1793   BBWorklist.insert(&F.getEntryBlock());
1794   bool SingleBB = true;
1795   // Note that we *must not* cache the size, this loop grows the worklist.
1796   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1797     // Bail out the moment we cross the threshold. This means we'll under-count
1798     // the cost, but only when undercounting doesn't matter.
1799     if (Cost >= Threshold && !ComputeFullInlineCost)
1800       break;
1801 
1802     BasicBlock *BB = BBWorklist[Idx];
1803     if (BB->empty())
1804       continue;
1805 
1806     // Disallow inlining a blockaddress. A blockaddress only has defined
1807     // behavior for an indirect branch in the same function, and we do not
1808     // currently support inlining indirect branches. But, the inliner may not
1809     // see an indirect branch that ends up being dead code at a particular call
1810     // site. If the blockaddress escapes the function, e.g., via a global
1811     // variable, inlining may lead to an invalid cross-function reference.
1812     if (BB->hasAddressTaken())
1813       return false;
1814 
1815     // Analyze the cost of this block. If we blow through the threshold, this
1816     // returns false, and we can bail on out.
1817     if (!analyzeBlock(BB, EphValues))
1818       return false;
1819 
1820     TerminatorInst *TI = BB->getTerminator();
1821 
1822     // Add in the live successors by first checking whether we have terminator
1823     // that may be simplified based on the values simplified by this call.
1824     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1825       if (BI->isConditional()) {
1826         Value *Cond = BI->getCondition();
1827         if (ConstantInt *SimpleCond =
1828                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1829           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1830           BBWorklist.insert(NextBB);
1831           KnownSuccessors[BB] = NextBB;
1832           findDeadBlocks(BB, NextBB);
1833           continue;
1834         }
1835       }
1836     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1837       Value *Cond = SI->getCondition();
1838       if (ConstantInt *SimpleCond =
1839               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1840         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1841         BBWorklist.insert(NextBB);
1842         KnownSuccessors[BB] = NextBB;
1843         findDeadBlocks(BB, NextBB);
1844         continue;
1845       }
1846     }
1847 
1848     // If we're unable to select a particular successor, just count all of
1849     // them.
1850     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1851          ++TIdx)
1852       BBWorklist.insert(TI->getSuccessor(TIdx));
1853 
1854     // If we had any successors at this point, than post-inlining is likely to
1855     // have them as well. Note that we assume any basic blocks which existed
1856     // due to branches or switches which folded above will also fold after
1857     // inlining.
1858     if (SingleBB && TI->getNumSuccessors() > 1) {
1859       // Take off the bonus we applied to the threshold.
1860       Threshold -= SingleBBBonus;
1861       SingleBB = false;
1862     }
1863   }
1864 
1865   bool OnlyOneCallAndLocalLinkage =
1866       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1867   // If this is a noduplicate call, we can still inline as long as
1868   // inlining this would cause the removal of the caller (so the instruction
1869   // is not actually duplicated, just moved).
1870   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1871     return false;
1872 
1873   // We applied the maximum possible vector bonus at the beginning. Now,
1874   // subtract the excess bonus, if any, from the Threshold before
1875   // comparing against Cost.
1876   if (NumVectorInstructions <= NumInstructions / 10)
1877     Threshold -= VectorBonus;
1878   else if (NumVectorInstructions <= NumInstructions / 2)
1879     Threshold -= VectorBonus/2;
1880 
1881   return Cost < std::max(1, Threshold);
1882 }
1883 
1884 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1885 /// Dump stats about this call's analysis.
dump()1886 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1887 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
1888   DEBUG_PRINT_STAT(NumConstantArgs);
1889   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1890   DEBUG_PRINT_STAT(NumAllocaArgs);
1891   DEBUG_PRINT_STAT(NumConstantPtrCmps);
1892   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1893   DEBUG_PRINT_STAT(NumInstructionsSimplified);
1894   DEBUG_PRINT_STAT(NumInstructions);
1895   DEBUG_PRINT_STAT(SROACostSavings);
1896   DEBUG_PRINT_STAT(SROACostSavingsLost);
1897   DEBUG_PRINT_STAT(LoadEliminationCost);
1898   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1899   DEBUG_PRINT_STAT(Cost);
1900   DEBUG_PRINT_STAT(Threshold);
1901 #undef DEBUG_PRINT_STAT
1902 }
1903 #endif
1904 
1905 /// Test that there are no attribute conflicts between Caller and Callee
1906 ///        that prevent inlining.
functionsHaveCompatibleAttributes(Function * Caller,Function * Callee,TargetTransformInfo & TTI)1907 static bool functionsHaveCompatibleAttributes(Function *Caller,
1908                                               Function *Callee,
1909                                               TargetTransformInfo &TTI) {
1910   return TTI.areInlineCompatible(Caller, Callee) &&
1911          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1912 }
1913 
getCallsiteCost(CallSite CS,const DataLayout & DL)1914 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1915   int Cost = 0;
1916   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1917     if (CS.isByValArgument(I)) {
1918       // We approximate the number of loads and stores needed by dividing the
1919       // size of the byval type by the target's pointer size.
1920       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1921       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1922       unsigned AS = PTy->getAddressSpace();
1923       unsigned PointerSize = DL.getPointerSizeInBits(AS);
1924       // Ceiling division.
1925       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1926 
1927       // If it generates more than 8 stores it is likely to be expanded as an
1928       // inline memcpy so we take that as an upper bound. Otherwise we assume
1929       // one load and one store per word copied.
1930       // FIXME: The maxStoresPerMemcpy setting from the target should be used
1931       // here instead of a magic number of 8, but it's not available via
1932       // DataLayout.
1933       NumStores = std::min(NumStores, 8U);
1934 
1935       Cost += 2 * NumStores * InlineConstants::InstrCost;
1936     } else {
1937       // For non-byval arguments subtract off one instruction per call
1938       // argument.
1939       Cost += InlineConstants::InstrCost;
1940     }
1941   }
1942   // The call instruction also disappears after inlining.
1943   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1944   return Cost;
1945 }
1946 
getInlineCost(CallSite CS,const InlineParams & Params,TargetTransformInfo & CalleeTTI,std::function<AssumptionCache & (Function &)> & GetAssumptionCache,Optional<function_ref<BlockFrequencyInfo & (Function &)>> GetBFI,ProfileSummaryInfo * PSI,OptimizationRemarkEmitter * ORE)1947 InlineCost llvm::getInlineCost(
1948     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1949     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1950     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1951     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1952   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1953                        GetAssumptionCache, GetBFI, PSI, ORE);
1954 }
1955 
getInlineCost(CallSite CS,Function * Callee,const InlineParams & Params,TargetTransformInfo & CalleeTTI,std::function<AssumptionCache & (Function &)> & GetAssumptionCache,Optional<function_ref<BlockFrequencyInfo & (Function &)>> GetBFI,ProfileSummaryInfo * PSI,OptimizationRemarkEmitter * ORE)1956 InlineCost llvm::getInlineCost(
1957     CallSite CS, Function *Callee, const InlineParams &Params,
1958     TargetTransformInfo &CalleeTTI,
1959     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1960     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1961     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1962 
1963   // Cannot inline indirect calls.
1964   if (!Callee)
1965     return llvm::InlineCost::getNever();
1966 
1967   // Never inline calls with byval arguments that does not have the alloca
1968   // address space. Since byval arguments can be replaced with a copy to an
1969   // alloca, the inlined code would need to be adjusted to handle that the
1970   // argument is in the alloca address space (so it is a little bit complicated
1971   // to solve).
1972   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
1973   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
1974     if (CS.isByValArgument(I)) {
1975       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1976       if (PTy->getAddressSpace() != AllocaAS)
1977         return llvm::InlineCost::getNever();
1978     }
1979 
1980   // Calls to functions with always-inline attributes should be inlined
1981   // whenever possible.
1982   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1983     if (isInlineViable(*Callee))
1984       return llvm::InlineCost::getAlways();
1985     return llvm::InlineCost::getNever();
1986   }
1987 
1988   // Never inline functions with conflicting attributes (unless callee has
1989   // always-inline attribute).
1990   Function *Caller = CS.getCaller();
1991   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
1992     return llvm::InlineCost::getNever();
1993 
1994   // Don't inline this call if the caller has the optnone attribute.
1995   if (Caller->hasFnAttribute(Attribute::OptimizeNone))
1996     return llvm::InlineCost::getNever();
1997 
1998   // Don't inline a function that treats null pointer as valid into a caller
1999   // that does not have this attribute.
2000   if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2001     return llvm::InlineCost::getNever();
2002 
2003   // Don't inline functions which can be interposed at link-time.  Don't inline
2004   // functions marked noinline or call sites marked noinline.
2005   // Note: inlining non-exact non-interposable functions is fine, since we know
2006   // we have *a* correct implementation of the source level function.
2007   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
2008       CS.isNoInline())
2009     return llvm::InlineCost::getNever();
2010 
2011   LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
2012                           << "... (caller:" << Caller->getName() << ")\n");
2013 
2014   CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
2015                   Params);
2016   bool ShouldInline = CA.analyzeCall(CS);
2017 
2018   LLVM_DEBUG(CA.dump());
2019 
2020   // Check if there was a reason to force inlining or no inlining.
2021   if (!ShouldInline && CA.getCost() < CA.getThreshold())
2022     return InlineCost::getNever();
2023   if (ShouldInline && CA.getCost() >= CA.getThreshold())
2024     return InlineCost::getAlways();
2025 
2026   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2027 }
2028 
isInlineViable(Function & F)2029 bool llvm::isInlineViable(Function &F) {
2030   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2031   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2032     // Disallow inlining of functions which contain indirect branches or
2033     // blockaddresses.
2034     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2035       return false;
2036 
2037     for (auto &II : *BI) {
2038       CallSite CS(&II);
2039       if (!CS)
2040         continue;
2041 
2042       // Disallow recursive calls.
2043       if (&F == CS.getCalledFunction())
2044         return false;
2045 
2046       // Disallow calls which expose returns-twice to a function not previously
2047       // attributed as such.
2048       if (!ReturnsTwice && CS.isCall() &&
2049           cast<CallInst>(CS.getInstruction())->canReturnTwice())
2050         return false;
2051 
2052       if (CS.getCalledFunction())
2053         switch (CS.getCalledFunction()->getIntrinsicID()) {
2054         default:
2055           break;
2056         // Disallow inlining of @llvm.icall.branch.funnel because current
2057         // backend can't separate call targets from call arguments.
2058         case llvm::Intrinsic::icall_branch_funnel:
2059         // Disallow inlining functions that call @llvm.localescape. Doing this
2060         // correctly would require major changes to the inliner.
2061         case llvm::Intrinsic::localescape:
2062         // Disallow inlining of functions that access VarArgs.
2063         case llvm::Intrinsic::vastart:
2064         case llvm::Intrinsic::vaend:
2065           return false;
2066         }
2067     }
2068   }
2069 
2070   return true;
2071 }
2072 
2073 // APIs to create InlineParams based on command line flags and/or other
2074 // parameters.
2075 
getInlineParams(int Threshold)2076 InlineParams llvm::getInlineParams(int Threshold) {
2077   InlineParams Params;
2078 
2079   // This field is the threshold to use for a callee by default. This is
2080   // derived from one or more of:
2081   //  * optimization or size-optimization levels,
2082   //  * a value passed to createFunctionInliningPass function, or
2083   //  * the -inline-threshold flag.
2084   //  If the -inline-threshold flag is explicitly specified, that is used
2085   //  irrespective of anything else.
2086   if (InlineThreshold.getNumOccurrences() > 0)
2087     Params.DefaultThreshold = InlineThreshold;
2088   else
2089     Params.DefaultThreshold = Threshold;
2090 
2091   // Set the HintThreshold knob from the -inlinehint-threshold.
2092   Params.HintThreshold = HintThreshold;
2093 
2094   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2095   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2096 
2097   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2098   // populate LocallyHotCallSiteThreshold. Later, we populate
2099   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2100   // we know that optimization level is O3 (in the getInlineParams variant that
2101   // takes the opt and size levels).
2102   // FIXME: Remove this check (and make the assignment unconditional) after
2103   // addressing size regression issues at O2.
2104   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2105     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2106 
2107   // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2108   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2109 
2110   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2111   // -inlinehint-threshold commandline option is not explicitly given. If that
2112   // option is present, then its value applies even for callees with size and
2113   // minsize attributes.
2114   // If the -inline-threshold is not specified, set the ColdThreshold from the
2115   // -inlinecold-threshold even if it is not explicitly passed. If
2116   // -inline-threshold is specified, then -inlinecold-threshold needs to be
2117   // explicitly specified to set the ColdThreshold knob
2118   if (InlineThreshold.getNumOccurrences() == 0) {
2119     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2120     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2121     Params.ColdThreshold = ColdThreshold;
2122   } else if (ColdThreshold.getNumOccurrences() > 0) {
2123     Params.ColdThreshold = ColdThreshold;
2124   }
2125   return Params;
2126 }
2127 
getInlineParams()2128 InlineParams llvm::getInlineParams() {
2129   return getInlineParams(InlineThreshold);
2130 }
2131 
2132 // Compute the default threshold for inlining based on the opt level and the
2133 // size opt level.
computeThresholdFromOptLevels(unsigned OptLevel,unsigned SizeOptLevel)2134 static int computeThresholdFromOptLevels(unsigned OptLevel,
2135                                          unsigned SizeOptLevel) {
2136   if (OptLevel > 2)
2137     return InlineConstants::OptAggressiveThreshold;
2138   if (SizeOptLevel == 1) // -Os
2139     return InlineConstants::OptSizeThreshold;
2140   if (SizeOptLevel == 2) // -Oz
2141     return InlineConstants::OptMinSizeThreshold;
2142   return InlineThreshold;
2143 }
2144 
getInlineParams(unsigned OptLevel,unsigned SizeOptLevel)2145 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2146   auto Params =
2147       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2148   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2149   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2150   // when it is specified explicitly.
2151   if (OptLevel > 2)
2152     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2153   return Params;
2154 }
2155