1 //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
/// This file defines a TargetTransformInfo::Concept conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
13 /// provide more precise answers to certain TTI queries, while letting the
14 /// target independent and default TTI implementations handle the rest.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
20 
21 #include "AMDGPU.h"
22 #include "AMDGPUSubtarget.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/ArrayRef.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/CodeGen/BasicTTIImpl.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/MC/SubtargetFeature.h"
31 #include "llvm/Support/MathExtras.h"
32 #include <cassert>
33 
34 namespace llvm {
35 
36 class AMDGPUTargetLowering;
37 class Loop;
38 class ScalarEvolution;
39 class Type;
40 class Value;
41 
42 class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
43   using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
44   using TTI = TargetTransformInfo;
45 
46   friend BaseT;
47 
48   Triple TargetTriple;
49 
50 public:
AMDGPUTTIImpl(const AMDGPUTargetMachine * TM,const Function & F)51   explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
52     : BaseT(TM, F.getParent()->getDataLayout()),
53       TargetTriple(TM->getTargetTriple()) {}
54 
55   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
56                                TTI::UnrollingPreferences &UP);
57 };
58 
59 class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
60   using BaseT = BasicTTIImplBase<GCNTTIImpl>;
61   using TTI = TargetTransformInfo;
62 
63   friend BaseT;
64 
65   const GCNSubtarget *ST;
66   const AMDGPUTargetLowering *TLI;
67   AMDGPUTTIImpl CommonTTI;
68   bool IsGraphicsShader;
69 
70   const FeatureBitset InlineFeatureIgnoreList = {
71     // Codegen control options which don't matter.
72     AMDGPU::FeatureEnableLoadStoreOpt,
73     AMDGPU::FeatureEnableSIScheduler,
74     AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
75     AMDGPU::FeatureFlatForGlobal,
76     AMDGPU::FeaturePromoteAlloca,
77     AMDGPU::FeatureUnalignedBufferAccess,
78     AMDGPU::FeatureUnalignedScratchAccess,
79 
80     AMDGPU::FeatureAutoWaitcntBeforeBarrier,
81     AMDGPU::FeatureDebuggerEmitPrologue,
82     AMDGPU::FeatureDebuggerInsertNops,
83 
84     // Property of the kernel/environment which can't actually differ.
85     AMDGPU::FeatureSGPRInitBug,
86     AMDGPU::FeatureXNACK,
87     AMDGPU::FeatureTrapHandler,
88 
89     // Perf-tuning features
90     AMDGPU::FeatureFastFMAF32,
91     AMDGPU::HalfRate64Ops
92   };
93 
getST()94   const GCNSubtarget *getST() const { return ST; }
getTLI()95   const AMDGPUTargetLowering *getTLI() const { return TLI; }
96 
getFullRateInstrCost()97   static inline int getFullRateInstrCost() {
98     return TargetTransformInfo::TCC_Basic;
99   }
100 
getHalfRateInstrCost()101   static inline int getHalfRateInstrCost() {
102     return 2 * TargetTransformInfo::TCC_Basic;
103   }
104 
105   // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
106   // should be 2 or 4.
getQuarterRateInstrCost()107   static inline int getQuarterRateInstrCost() {
108     return 3 * TargetTransformInfo::TCC_Basic;
109   }
110 
111    // On some parts, normal fp64 operations are half rate, and others
112    // quarter. This also applies to some integer operations.
get64BitInstrCost()113   inline int get64BitInstrCost() const {
114     return ST->hasHalfRate64Ops() ?
115       getHalfRateInstrCost() : getQuarterRateInstrCost();
116   }
117 
118 public:
GCNTTIImpl(const AMDGPUTargetMachine * TM,const Function & F)119   explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
120     : BaseT(TM, F.getParent()->getDataLayout()),
121       ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
122       TLI(ST->getTargetLowering()),
123       CommonTTI(TM, F),
124       IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
125 
hasBranchDivergence()126   bool hasBranchDivergence() { return true; }
127 
128   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
129                                TTI::UnrollingPreferences &UP);
130 
getPopcntSupport(unsigned TyWidth)131   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
132     assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
133     return TTI::PSK_FastHardware;
134   }
135 
136   unsigned getHardwareNumberOfRegisters(bool Vector) const;
137   unsigned getNumberOfRegisters(bool Vector) const;
138   unsigned getRegisterBitWidth(bool Vector) const;
139   unsigned getMinVectorRegisterBitWidth() const;
140   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
141                                unsigned ChainSizeInBytes,
142                                VectorType *VecTy) const;
143   unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
144                                 unsigned ChainSizeInBytes,
145                                 VectorType *VecTy) const;
146   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
147 
148   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
149                                   unsigned Alignment,
150                                   unsigned AddrSpace) const;
151   bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
152                                    unsigned Alignment,
153                                    unsigned AddrSpace) const;
154   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
155                                     unsigned Alignment,
156                                     unsigned AddrSpace) const;
157 
158   unsigned getMaxInterleaveFactor(unsigned VF);
159 
160   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
161 
162   int getArithmeticInstrCost(
163     unsigned Opcode, Type *Ty,
164     TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
165     TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
166     TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
167     TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
168     ArrayRef<const Value *> Args = ArrayRef<const Value *>());
169 
170   unsigned getCFInstrCost(unsigned Opcode);
171 
172   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
173   bool isSourceOfDivergence(const Value *V) const;
174   bool isAlwaysUniform(const Value *V) const;
175 
getFlatAddressSpace()176   unsigned getFlatAddressSpace() const {
177     // Don't bother running InferAddressSpaces pass on graphics shaders which
178     // don't use flat addressing.
179     if (IsGraphicsShader)
180       return -1;
181     return ST->hasFlatAddressSpace() ?
182       ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE;
183   }
184 
getVectorSplitCost()185   unsigned getVectorSplitCost() { return 0; }
186 
187   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
188                           Type *SubTp);
189 
190   bool areInlineCompatible(const Function *Caller,
191                            const Function *Callee) const;
192 
getInliningThresholdMultiplier()193   unsigned getInliningThresholdMultiplier() { return 9; }
194 
195   int getArithmeticReductionCost(unsigned Opcode,
196                                  Type *Ty,
197                                  bool IsPairwise);
198   int getMinMaxReductionCost(Type *Ty, Type *CondTy,
199                              bool IsPairwiseForm,
200                              bool IsUnsigned);
201 };
202 
203 class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
204   using BaseT = BasicTTIImplBase<R600TTIImpl>;
205   using TTI = TargetTransformInfo;
206 
207   friend BaseT;
208 
209   const R600Subtarget *ST;
210   const AMDGPUTargetLowering *TLI;
211   AMDGPUTTIImpl CommonTTI;
212 
213 public:
R600TTIImpl(const AMDGPUTargetMachine * TM,const Function & F)214   explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
215     : BaseT(TM, F.getParent()->getDataLayout()),
216       ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
217       TLI(ST->getTargetLowering()),
218       CommonTTI(TM, F)	{}
219 
getST()220   const R600Subtarget *getST() const { return ST; }
getTLI()221   const AMDGPUTargetLowering *getTLI() const { return TLI; }
222 
223   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
224                                TTI::UnrollingPreferences &UP);
225   unsigned getHardwareNumberOfRegisters(bool Vec) const;
226   unsigned getNumberOfRegisters(bool Vec) const;
227   unsigned getRegisterBitWidth(bool Vector) const;
228   unsigned getMinVectorRegisterBitWidth() const;
229   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
230   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
231                                   unsigned AddrSpace) const;
232   bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
233 		                   unsigned Alignment,
234                                    unsigned AddrSpace) const;
235   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
236                                     unsigned Alignment,
237                                     unsigned AddrSpace) const;
238   unsigned getMaxInterleaveFactor(unsigned VF);
239   unsigned getCFInstrCost(unsigned Opcode);
240   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
241 };
242 
243 } // end namespace llvm
244 
245 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
246