1 //
2 //                     The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelLowering.h"
15 #include "NVPTX.h"
16 #include "NVPTXTargetMachine.h"
17 #include "NVPTXTargetObjectFile.h"
18 #include "NVPTXUtilities.h"
19 #include "llvm/CodeGen/Analysis.h"
20 #include "llvm/CodeGen/MachineFrameInfo.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstrBuilder.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
25 #include "llvm/IR/CallSite.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/GlobalValue.h"
29 #include "llvm/IR/IntrinsicInst.h"
30 #include "llvm/IR/Intrinsics.h"
31 #include "llvm/IR/Module.h"
32 #include "llvm/MC/MCSectionELF.h"
33 #include "llvm/Support/CommandLine.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/MathExtras.h"
37 #include "llvm/Support/raw_ostream.h"
38 #include <sstream>
39 
40 #undef DEBUG_TYPE
41 #define DEBUG_TYPE "nvptx-lower"
42 
43 using namespace llvm;
44 
45 static unsigned int uniqueCallSite = 0;
46 
47 static cl::opt<bool> sched4reg(
48     "nvptx-sched4reg",
49     cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
50 
51 static cl::opt<unsigned>
52 FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
53                     cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
54                              " 1: do it  2: do it aggressively"),
55                     cl::init(2));
56 
IsPTXVectorType(MVT VT)57 static bool IsPTXVectorType(MVT VT) {
58   switch (VT.SimpleTy) {
59   default:
60     return false;
61   case MVT::v2i1:
62   case MVT::v4i1:
63   case MVT::v2i8:
64   case MVT::v4i8:
65   case MVT::v2i16:
66   case MVT::v4i16:
67   case MVT::v2i32:
68   case MVT::v4i32:
69   case MVT::v2i64:
70   case MVT::v2f32:
71   case MVT::v4f32:
72   case MVT::v2f64:
73     return true;
74   }
75 }
76 
77 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
78 /// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
79 /// into their primitive components.
80 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
81 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
82 /// LowerCall, and LowerReturn.
ComputePTXValueVTs(const TargetLowering & TLI,const DataLayout & DL,Type * Ty,SmallVectorImpl<EVT> & ValueVTs,SmallVectorImpl<uint64_t> * Offsets=nullptr,uint64_t StartingOffset=0)83 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
84                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
85                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
86                                uint64_t StartingOffset = 0) {
87   SmallVector<EVT, 16> TempVTs;
88   SmallVector<uint64_t, 16> TempOffsets;
89 
90   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
91   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
92     EVT VT = TempVTs[i];
93     uint64_t Off = TempOffsets[i];
94     if (VT.isVector())
95       for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
96         ValueVTs.push_back(VT.getVectorElementType());
97         if (Offsets)
98           Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
99       }
100     else {
101       ValueVTs.push_back(VT);
102       if (Offsets)
103         Offsets->push_back(Off);
104     }
105   }
106 }
107 
108 // NVPTXTargetLowering Constructor.
NVPTXTargetLowering(const NVPTXTargetMachine & TM,const NVPTXSubtarget & STI)109 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
110                                          const NVPTXSubtarget &STI)
111     : TargetLowering(TM), nvTM(&TM), STI(STI) {
112 
113   // always lower memset, memcpy, and memmove intrinsics to load/store
114   // instructions, rather
115   // then generating calls to memset, mempcy or memmove.
116   MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
117   MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
118   MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
119 
120   setBooleanContents(ZeroOrNegativeOneBooleanContent);
121   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
122 
123   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
124   // condition branches.
125   setJumpIsExpensive(true);
126 
127   // Wide divides are _very_ slow. Try to reduce the width of the divide if
128   // possible.
129   addBypassSlowDiv(64, 32);
130 
131   // By default, use the Source scheduling
132   if (sched4reg)
133     setSchedulingPreference(Sched::RegPressure);
134   else
135     setSchedulingPreference(Sched::Source);
136 
137   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
138   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
139   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
140   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
141   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
142   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
143 
144   // Operations not directly supported by NVPTX.
145   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
146   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
147   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
148   setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
149   setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
150   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
151   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
152   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
153   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
154   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
155   setOperationAction(ISD::BR_CC, MVT::i8, Expand);
156   setOperationAction(ISD::BR_CC, MVT::i16, Expand);
157   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
158   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
159   // Some SIGN_EXTEND_INREG can be done using cvt instruction.
160   // For others we will expand to a SHL/SRA pair.
161   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
162   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
163   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
164   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
165   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
166 
167   setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
168   setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
169   setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
170   setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
171   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
172   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
173 
174   if (STI.hasROT64()) {
175     setOperationAction(ISD::ROTL, MVT::i64, Legal);
176     setOperationAction(ISD::ROTR, MVT::i64, Legal);
177   } else {
178     setOperationAction(ISD::ROTL, MVT::i64, Expand);
179     setOperationAction(ISD::ROTR, MVT::i64, Expand);
180   }
181   if (STI.hasROT32()) {
182     setOperationAction(ISD::ROTL, MVT::i32, Legal);
183     setOperationAction(ISD::ROTR, MVT::i32, Legal);
184   } else {
185     setOperationAction(ISD::ROTL, MVT::i32, Expand);
186     setOperationAction(ISD::ROTR, MVT::i32, Expand);
187   }
188 
189   setOperationAction(ISD::ROTL, MVT::i16, Expand);
190   setOperationAction(ISD::ROTR, MVT::i16, Expand);
191   setOperationAction(ISD::ROTL, MVT::i8, Expand);
192   setOperationAction(ISD::ROTR, MVT::i8, Expand);
193   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
194   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
195   setOperationAction(ISD::BSWAP, MVT::i64, Expand);
196 
197   // Indirect branch is not supported.
198   // This also disables Jump Table creation.
199   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
200   setOperationAction(ISD::BRIND, MVT::Other, Expand);
201 
202   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
203   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
204 
205   // We want to legalize constant related memmove and memcopy
206   // intrinsics.
207   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
208 
209   // Turn FP extload into load/fextend
210   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
211   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
212   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
213   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
214   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
215   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
216   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
217   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
218   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
219   // Turn FP truncstore into trunc + store.
220   // FIXME: vector types should also be expanded
221   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
222   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
223   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
224 
225   // PTX does not support load / store predicate registers
226   setOperationAction(ISD::LOAD, MVT::i1, Custom);
227   setOperationAction(ISD::STORE, MVT::i1, Custom);
228 
229   for (MVT VT : MVT::integer_valuetypes()) {
230     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
231     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
232     setTruncStoreAction(VT, MVT::i1, Expand);
233   }
234 
235   // This is legal in NVPTX
236   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
237   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
238 
239   // TRAP can be lowered to PTX trap
240   setOperationAction(ISD::TRAP, MVT::Other, Legal);
241 
242   setOperationAction(ISD::ADDC, MVT::i64, Expand);
243   setOperationAction(ISD::ADDE, MVT::i64, Expand);
244 
245   // Register custom handling for vector loads/stores
246   for (MVT VT : MVT::vector_valuetypes()) {
247     if (IsPTXVectorType(VT)) {
248       setOperationAction(ISD::LOAD, VT, Custom);
249       setOperationAction(ISD::STORE, VT, Custom);
250       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
251     }
252   }
253 
254   // Custom handling for i8 intrinsics
255   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
256 
257   setOperationAction(ISD::CTLZ, MVT::i16, Legal);
258   setOperationAction(ISD::CTLZ, MVT::i32, Legal);
259   setOperationAction(ISD::CTLZ, MVT::i64, Legal);
260   setOperationAction(ISD::CTTZ, MVT::i16, Expand);
261   setOperationAction(ISD::CTTZ, MVT::i32, Expand);
262   setOperationAction(ISD::CTTZ, MVT::i64, Expand);
263   setOperationAction(ISD::CTPOP, MVT::i16, Legal);
264   setOperationAction(ISD::CTPOP, MVT::i32, Legal);
265   setOperationAction(ISD::CTPOP, MVT::i64, Legal);
266 
267   // PTX does not directly support SELP of i1, so promote to i32 first
268   setOperationAction(ISD::SELECT, MVT::i1, Custom);
269 
270   // PTX cannot multiply two i64s in a single instruction.
271   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
272   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
273 
274   // We have some custom DAG combine patterns for these nodes
275   setTargetDAGCombine(ISD::ADD);
276   setTargetDAGCombine(ISD::AND);
277   setTargetDAGCombine(ISD::FADD);
278   setTargetDAGCombine(ISD::MUL);
279   setTargetDAGCombine(ISD::SHL);
280   setTargetDAGCombine(ISD::SELECT);
281 
282   // Now deduce the information based on the above mentioned
283   // actions
284   computeRegisterProperties(STI.getRegisterInfo());
285 }
286 
getTargetNodeName(unsigned Opcode) const287 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
288   switch ((NVPTXISD::NodeType)Opcode) {
289   case NVPTXISD::FIRST_NUMBER:
290     break;
291   case NVPTXISD::CALL:
292     return "NVPTXISD::CALL";
293   case NVPTXISD::RET_FLAG:
294     return "NVPTXISD::RET_FLAG";
295   case NVPTXISD::LOAD_PARAM:
296     return "NVPTXISD::LOAD_PARAM";
297   case NVPTXISD::Wrapper:
298     return "NVPTXISD::Wrapper";
299   case NVPTXISD::DeclareParam:
300     return "NVPTXISD::DeclareParam";
301   case NVPTXISD::DeclareScalarParam:
302     return "NVPTXISD::DeclareScalarParam";
303   case NVPTXISD::DeclareRet:
304     return "NVPTXISD::DeclareRet";
305   case NVPTXISD::DeclareScalarRet:
306     return "NVPTXISD::DeclareScalarRet";
307   case NVPTXISD::DeclareRetParam:
308     return "NVPTXISD::DeclareRetParam";
309   case NVPTXISD::PrintCall:
310     return "NVPTXISD::PrintCall";
311   case NVPTXISD::PrintConvergentCall:
312     return "NVPTXISD::PrintConvergentCall";
313   case NVPTXISD::PrintCallUni:
314     return "NVPTXISD::PrintCallUni";
315   case NVPTXISD::PrintConvergentCallUni:
316     return "NVPTXISD::PrintConvergentCallUni";
317   case NVPTXISD::LoadParam:
318     return "NVPTXISD::LoadParam";
319   case NVPTXISD::LoadParamV2:
320     return "NVPTXISD::LoadParamV2";
321   case NVPTXISD::LoadParamV4:
322     return "NVPTXISD::LoadParamV4";
323   case NVPTXISD::StoreParam:
324     return "NVPTXISD::StoreParam";
325   case NVPTXISD::StoreParamV2:
326     return "NVPTXISD::StoreParamV2";
327   case NVPTXISD::StoreParamV4:
328     return "NVPTXISD::StoreParamV4";
329   case NVPTXISD::StoreParamS32:
330     return "NVPTXISD::StoreParamS32";
331   case NVPTXISD::StoreParamU32:
332     return "NVPTXISD::StoreParamU32";
333   case NVPTXISD::CallArgBegin:
334     return "NVPTXISD::CallArgBegin";
335   case NVPTXISD::CallArg:
336     return "NVPTXISD::CallArg";
337   case NVPTXISD::LastCallArg:
338     return "NVPTXISD::LastCallArg";
339   case NVPTXISD::CallArgEnd:
340     return "NVPTXISD::CallArgEnd";
341   case NVPTXISD::CallVoid:
342     return "NVPTXISD::CallVoid";
343   case NVPTXISD::CallVal:
344     return "NVPTXISD::CallVal";
345   case NVPTXISD::CallSymbol:
346     return "NVPTXISD::CallSymbol";
347   case NVPTXISD::Prototype:
348     return "NVPTXISD::Prototype";
349   case NVPTXISD::MoveParam:
350     return "NVPTXISD::MoveParam";
351   case NVPTXISD::StoreRetval:
352     return "NVPTXISD::StoreRetval";
353   case NVPTXISD::StoreRetvalV2:
354     return "NVPTXISD::StoreRetvalV2";
355   case NVPTXISD::StoreRetvalV4:
356     return "NVPTXISD::StoreRetvalV4";
357   case NVPTXISD::PseudoUseParam:
358     return "NVPTXISD::PseudoUseParam";
359   case NVPTXISD::RETURN:
360     return "NVPTXISD::RETURN";
361   case NVPTXISD::CallSeqBegin:
362     return "NVPTXISD::CallSeqBegin";
363   case NVPTXISD::CallSeqEnd:
364     return "NVPTXISD::CallSeqEnd";
365   case NVPTXISD::CallPrototype:
366     return "NVPTXISD::CallPrototype";
367   case NVPTXISD::LoadV2:
368     return "NVPTXISD::LoadV2";
369   case NVPTXISD::LoadV4:
370     return "NVPTXISD::LoadV4";
371   case NVPTXISD::LDGV2:
372     return "NVPTXISD::LDGV2";
373   case NVPTXISD::LDGV4:
374     return "NVPTXISD::LDGV4";
375   case NVPTXISD::LDUV2:
376     return "NVPTXISD::LDUV2";
377   case NVPTXISD::LDUV4:
378     return "NVPTXISD::LDUV4";
379   case NVPTXISD::StoreV2:
380     return "NVPTXISD::StoreV2";
381   case NVPTXISD::StoreV4:
382     return "NVPTXISD::StoreV4";
383   case NVPTXISD::FUN_SHFL_CLAMP:
384     return "NVPTXISD::FUN_SHFL_CLAMP";
385   case NVPTXISD::FUN_SHFR_CLAMP:
386     return "NVPTXISD::FUN_SHFR_CLAMP";
387   case NVPTXISD::IMAD:
388     return "NVPTXISD::IMAD";
389   case NVPTXISD::Dummy:
390     return "NVPTXISD::Dummy";
391   case NVPTXISD::MUL_WIDE_SIGNED:
392     return "NVPTXISD::MUL_WIDE_SIGNED";
393   case NVPTXISD::MUL_WIDE_UNSIGNED:
394     return "NVPTXISD::MUL_WIDE_UNSIGNED";
395   case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
396   case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
397   case NVPTXISD::Tex1DFloatFloatLevel:
398     return "NVPTXISD::Tex1DFloatFloatLevel";
399   case NVPTXISD::Tex1DFloatFloatGrad:
400     return "NVPTXISD::Tex1DFloatFloatGrad";
401   case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
402   case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
403   case NVPTXISD::Tex1DS32FloatLevel:
404     return "NVPTXISD::Tex1DS32FloatLevel";
405   case NVPTXISD::Tex1DS32FloatGrad:
406     return "NVPTXISD::Tex1DS32FloatGrad";
407   case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
408   case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
409   case NVPTXISD::Tex1DU32FloatLevel:
410     return "NVPTXISD::Tex1DU32FloatLevel";
411   case NVPTXISD::Tex1DU32FloatGrad:
412     return "NVPTXISD::Tex1DU32FloatGrad";
413   case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
414   case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
415   case NVPTXISD::Tex1DArrayFloatFloatLevel:
416     return "NVPTXISD::Tex1DArrayFloatFloatLevel";
417   case NVPTXISD::Tex1DArrayFloatFloatGrad:
418     return "NVPTXISD::Tex1DArrayFloatFloatGrad";
419   case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
420   case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
421   case NVPTXISD::Tex1DArrayS32FloatLevel:
422     return "NVPTXISD::Tex1DArrayS32FloatLevel";
423   case NVPTXISD::Tex1DArrayS32FloatGrad:
424     return "NVPTXISD::Tex1DArrayS32FloatGrad";
425   case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
426   case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
427   case NVPTXISD::Tex1DArrayU32FloatLevel:
428     return "NVPTXISD::Tex1DArrayU32FloatLevel";
429   case NVPTXISD::Tex1DArrayU32FloatGrad:
430     return "NVPTXISD::Tex1DArrayU32FloatGrad";
431   case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
432   case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
433   case NVPTXISD::Tex2DFloatFloatLevel:
434     return "NVPTXISD::Tex2DFloatFloatLevel";
435   case NVPTXISD::Tex2DFloatFloatGrad:
436     return "NVPTXISD::Tex2DFloatFloatGrad";
437   case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
438   case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
439   case NVPTXISD::Tex2DS32FloatLevel:
440     return "NVPTXISD::Tex2DS32FloatLevel";
441   case NVPTXISD::Tex2DS32FloatGrad:
442     return "NVPTXISD::Tex2DS32FloatGrad";
443   case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
444   case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
445   case NVPTXISD::Tex2DU32FloatLevel:
446     return "NVPTXISD::Tex2DU32FloatLevel";
447   case NVPTXISD::Tex2DU32FloatGrad:
448     return "NVPTXISD::Tex2DU32FloatGrad";
449   case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
450   case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
451   case NVPTXISD::Tex2DArrayFloatFloatLevel:
452     return "NVPTXISD::Tex2DArrayFloatFloatLevel";
453   case NVPTXISD::Tex2DArrayFloatFloatGrad:
454     return "NVPTXISD::Tex2DArrayFloatFloatGrad";
455   case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
456   case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
457   case NVPTXISD::Tex2DArrayS32FloatLevel:
458     return "NVPTXISD::Tex2DArrayS32FloatLevel";
459   case NVPTXISD::Tex2DArrayS32FloatGrad:
460     return "NVPTXISD::Tex2DArrayS32FloatGrad";
461   case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
462   case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
463   case NVPTXISD::Tex2DArrayU32FloatLevel:
464     return "NVPTXISD::Tex2DArrayU32FloatLevel";
465   case NVPTXISD::Tex2DArrayU32FloatGrad:
466     return "NVPTXISD::Tex2DArrayU32FloatGrad";
467   case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
468   case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
469   case NVPTXISD::Tex3DFloatFloatLevel:
470     return "NVPTXISD::Tex3DFloatFloatLevel";
471   case NVPTXISD::Tex3DFloatFloatGrad:
472     return "NVPTXISD::Tex3DFloatFloatGrad";
473   case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
474   case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
475   case NVPTXISD::Tex3DS32FloatLevel:
476     return "NVPTXISD::Tex3DS32FloatLevel";
477   case NVPTXISD::Tex3DS32FloatGrad:
478     return "NVPTXISD::Tex3DS32FloatGrad";
479   case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
480   case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
481   case NVPTXISD::Tex3DU32FloatLevel:
482     return "NVPTXISD::Tex3DU32FloatLevel";
483   case NVPTXISD::Tex3DU32FloatGrad:
484     return "NVPTXISD::Tex3DU32FloatGrad";
485   case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
486   case NVPTXISD::TexCubeFloatFloatLevel:
487     return "NVPTXISD::TexCubeFloatFloatLevel";
488   case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
489   case NVPTXISD::TexCubeS32FloatLevel:
490     return "NVPTXISD::TexCubeS32FloatLevel";
491   case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
492   case NVPTXISD::TexCubeU32FloatLevel:
493     return "NVPTXISD::TexCubeU32FloatLevel";
494   case NVPTXISD::TexCubeArrayFloatFloat:
495     return "NVPTXISD::TexCubeArrayFloatFloat";
496   case NVPTXISD::TexCubeArrayFloatFloatLevel:
497     return "NVPTXISD::TexCubeArrayFloatFloatLevel";
498   case NVPTXISD::TexCubeArrayS32Float:
499     return "NVPTXISD::TexCubeArrayS32Float";
500   case NVPTXISD::TexCubeArrayS32FloatLevel:
501     return "NVPTXISD::TexCubeArrayS32FloatLevel";
502   case NVPTXISD::TexCubeArrayU32Float:
503     return "NVPTXISD::TexCubeArrayU32Float";
504   case NVPTXISD::TexCubeArrayU32FloatLevel:
505     return "NVPTXISD::TexCubeArrayU32FloatLevel";
506   case NVPTXISD::Tld4R2DFloatFloat:
507     return "NVPTXISD::Tld4R2DFloatFloat";
508   case NVPTXISD::Tld4G2DFloatFloat:
509     return "NVPTXISD::Tld4G2DFloatFloat";
510   case NVPTXISD::Tld4B2DFloatFloat:
511     return "NVPTXISD::Tld4B2DFloatFloat";
512   case NVPTXISD::Tld4A2DFloatFloat:
513     return "NVPTXISD::Tld4A2DFloatFloat";
514   case NVPTXISD::Tld4R2DS64Float:
515     return "NVPTXISD::Tld4R2DS64Float";
516   case NVPTXISD::Tld4G2DS64Float:
517     return "NVPTXISD::Tld4G2DS64Float";
518   case NVPTXISD::Tld4B2DS64Float:
519     return "NVPTXISD::Tld4B2DS64Float";
520   case NVPTXISD::Tld4A2DS64Float:
521     return "NVPTXISD::Tld4A2DS64Float";
522   case NVPTXISD::Tld4R2DU64Float:
523     return "NVPTXISD::Tld4R2DU64Float";
524   case NVPTXISD::Tld4G2DU64Float:
525     return "NVPTXISD::Tld4G2DU64Float";
526   case NVPTXISD::Tld4B2DU64Float:
527     return "NVPTXISD::Tld4B2DU64Float";
528   case NVPTXISD::Tld4A2DU64Float:
529     return "NVPTXISD::Tld4A2DU64Float";
530 
531   case NVPTXISD::TexUnified1DFloatS32:
532     return "NVPTXISD::TexUnified1DFloatS32";
533   case NVPTXISD::TexUnified1DFloatFloat:
534     return "NVPTXISD::TexUnified1DFloatFloat";
535   case NVPTXISD::TexUnified1DFloatFloatLevel:
536     return "NVPTXISD::TexUnified1DFloatFloatLevel";
537   case NVPTXISD::TexUnified1DFloatFloatGrad:
538     return "NVPTXISD::TexUnified1DFloatFloatGrad";
539   case NVPTXISD::TexUnified1DS32S32:
540     return "NVPTXISD::TexUnified1DS32S32";
541   case NVPTXISD::TexUnified1DS32Float:
542     return "NVPTXISD::TexUnified1DS32Float";
543   case NVPTXISD::TexUnified1DS32FloatLevel:
544     return "NVPTXISD::TexUnified1DS32FloatLevel";
545   case NVPTXISD::TexUnified1DS32FloatGrad:
546     return "NVPTXISD::TexUnified1DS32FloatGrad";
547   case NVPTXISD::TexUnified1DU32S32:
548     return "NVPTXISD::TexUnified1DU32S32";
549   case NVPTXISD::TexUnified1DU32Float:
550     return "NVPTXISD::TexUnified1DU32Float";
551   case NVPTXISD::TexUnified1DU32FloatLevel:
552     return "NVPTXISD::TexUnified1DU32FloatLevel";
553   case NVPTXISD::TexUnified1DU32FloatGrad:
554     return "NVPTXISD::TexUnified1DU32FloatGrad";
555   case NVPTXISD::TexUnified1DArrayFloatS32:
556     return "NVPTXISD::TexUnified1DArrayFloatS32";
557   case NVPTXISD::TexUnified1DArrayFloatFloat:
558     return "NVPTXISD::TexUnified1DArrayFloatFloat";
559   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
560     return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
561   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
562     return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
563   case NVPTXISD::TexUnified1DArrayS32S32:
564     return "NVPTXISD::TexUnified1DArrayS32S32";
565   case NVPTXISD::TexUnified1DArrayS32Float:
566     return "NVPTXISD::TexUnified1DArrayS32Float";
567   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
568     return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
569   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
570     return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
571   case NVPTXISD::TexUnified1DArrayU32S32:
572     return "NVPTXISD::TexUnified1DArrayU32S32";
573   case NVPTXISD::TexUnified1DArrayU32Float:
574     return "NVPTXISD::TexUnified1DArrayU32Float";
575   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
576     return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
577   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
578     return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
579   case NVPTXISD::TexUnified2DFloatS32:
580     return "NVPTXISD::TexUnified2DFloatS32";
581   case NVPTXISD::TexUnified2DFloatFloat:
582     return "NVPTXISD::TexUnified2DFloatFloat";
583   case NVPTXISD::TexUnified2DFloatFloatLevel:
584     return "NVPTXISD::TexUnified2DFloatFloatLevel";
585   case NVPTXISD::TexUnified2DFloatFloatGrad:
586     return "NVPTXISD::TexUnified2DFloatFloatGrad";
587   case NVPTXISD::TexUnified2DS32S32:
588     return "NVPTXISD::TexUnified2DS32S32";
589   case NVPTXISD::TexUnified2DS32Float:
590     return "NVPTXISD::TexUnified2DS32Float";
591   case NVPTXISD::TexUnified2DS32FloatLevel:
592     return "NVPTXISD::TexUnified2DS32FloatLevel";
593   case NVPTXISD::TexUnified2DS32FloatGrad:
594     return "NVPTXISD::TexUnified2DS32FloatGrad";
595   case NVPTXISD::TexUnified2DU32S32:
596     return "NVPTXISD::TexUnified2DU32S32";
597   case NVPTXISD::TexUnified2DU32Float:
598     return "NVPTXISD::TexUnified2DU32Float";
599   case NVPTXISD::TexUnified2DU32FloatLevel:
600     return "NVPTXISD::TexUnified2DU32FloatLevel";
601   case NVPTXISD::TexUnified2DU32FloatGrad:
602     return "NVPTXISD::TexUnified2DU32FloatGrad";
603   case NVPTXISD::TexUnified2DArrayFloatS32:
604     return "NVPTXISD::TexUnified2DArrayFloatS32";
605   case NVPTXISD::TexUnified2DArrayFloatFloat:
606     return "NVPTXISD::TexUnified2DArrayFloatFloat";
607   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
608     return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
609   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
610     return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
611   case NVPTXISD::TexUnified2DArrayS32S32:
612     return "NVPTXISD::TexUnified2DArrayS32S32";
613   case NVPTXISD::TexUnified2DArrayS32Float:
614     return "NVPTXISD::TexUnified2DArrayS32Float";
615   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
616     return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
617   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
618     return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
619   case NVPTXISD::TexUnified2DArrayU32S32:
620     return "NVPTXISD::TexUnified2DArrayU32S32";
621   case NVPTXISD::TexUnified2DArrayU32Float:
622     return "NVPTXISD::TexUnified2DArrayU32Float";
623   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
624     return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
625   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
626     return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
627   case NVPTXISD::TexUnified3DFloatS32:
628     return "NVPTXISD::TexUnified3DFloatS32";
629   case NVPTXISD::TexUnified3DFloatFloat:
630     return "NVPTXISD::TexUnified3DFloatFloat";
631   case NVPTXISD::TexUnified3DFloatFloatLevel:
632     return "NVPTXISD::TexUnified3DFloatFloatLevel";
633   case NVPTXISD::TexUnified3DFloatFloatGrad:
634     return "NVPTXISD::TexUnified3DFloatFloatGrad";
635   case NVPTXISD::TexUnified3DS32S32:
636     return "NVPTXISD::TexUnified3DS32S32";
637   case NVPTXISD::TexUnified3DS32Float:
638     return "NVPTXISD::TexUnified3DS32Float";
639   case NVPTXISD::TexUnified3DS32FloatLevel:
640     return "NVPTXISD::TexUnified3DS32FloatLevel";
641   case NVPTXISD::TexUnified3DS32FloatGrad:
642     return "NVPTXISD::TexUnified3DS32FloatGrad";
643   case NVPTXISD::TexUnified3DU32S32:
644     return "NVPTXISD::TexUnified3DU32S32";
645   case NVPTXISD::TexUnified3DU32Float:
646     return "NVPTXISD::TexUnified3DU32Float";
647   case NVPTXISD::TexUnified3DU32FloatLevel:
648     return "NVPTXISD::TexUnified3DU32FloatLevel";
649   case NVPTXISD::TexUnified3DU32FloatGrad:
650     return "NVPTXISD::TexUnified3DU32FloatGrad";
651   case NVPTXISD::TexUnifiedCubeFloatFloat:
652     return "NVPTXISD::TexUnifiedCubeFloatFloat";
653   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
654     return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
655   case NVPTXISD::TexUnifiedCubeS32Float:
656     return "NVPTXISD::TexUnifiedCubeS32Float";
657   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
658     return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
659   case NVPTXISD::TexUnifiedCubeU32Float:
660     return "NVPTXISD::TexUnifiedCubeU32Float";
661   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
662     return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
663   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
664     return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
665   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
666     return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
667   case NVPTXISD::TexUnifiedCubeArrayS32Float:
668     return "NVPTXISD::TexUnifiedCubeArrayS32Float";
669   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
670     return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
671   case NVPTXISD::TexUnifiedCubeArrayU32Float:
672     return "NVPTXISD::TexUnifiedCubeArrayU32Float";
673   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
674     return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
675   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
676     return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
677   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
678     return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
679   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
680     return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
681   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
682     return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
683   case NVPTXISD::Tld4UnifiedR2DS64Float:
684     return "NVPTXISD::Tld4UnifiedR2DS64Float";
685   case NVPTXISD::Tld4UnifiedG2DS64Float:
686     return "NVPTXISD::Tld4UnifiedG2DS64Float";
687   case NVPTXISD::Tld4UnifiedB2DS64Float:
688     return "NVPTXISD::Tld4UnifiedB2DS64Float";
689   case NVPTXISD::Tld4UnifiedA2DS64Float:
690     return "NVPTXISD::Tld4UnifiedA2DS64Float";
691   case NVPTXISD::Tld4UnifiedR2DU64Float:
692     return "NVPTXISD::Tld4UnifiedR2DU64Float";
693   case NVPTXISD::Tld4UnifiedG2DU64Float:
694     return "NVPTXISD::Tld4UnifiedG2DU64Float";
695   case NVPTXISD::Tld4UnifiedB2DU64Float:
696     return "NVPTXISD::Tld4UnifiedB2DU64Float";
697   case NVPTXISD::Tld4UnifiedA2DU64Float:
698     return "NVPTXISD::Tld4UnifiedA2DU64Float";
699 
700   case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
701   case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
702   case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
703   case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
704   case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
705   case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
706   case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
707   case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
708   case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
709   case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
710   case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
711 
712   case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
713   case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
714   case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
715   case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
716   case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
717   case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
718   case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
719   case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
720   case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
721   case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
722   case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
723 
724   case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
725   case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
726   case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
727   case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
728   case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
729   case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
730   case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
731   case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
732   case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
733   case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
734   case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
735 
736   case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
737   case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
738   case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
739   case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
740   case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
741   case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
742   case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
743   case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
744   case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
745   case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
746   case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
747 
748   case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
749   case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
750   case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
751   case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
752   case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
753   case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
754   case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
755   case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
756   case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
757   case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
758   case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
759 
760   case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
761   case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
762   case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
763   case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
764   case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
765   case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
766   case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
767   case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
768   case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
769   case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
770   case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
771 
772   case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
773   case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
774   case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
775   case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
776   case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
777   case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
778   case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
779   case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
780   case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
781   case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
782   case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
783 
784   case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
785   case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
786   case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
787   case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
788   case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
789   case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
790   case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
791   case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
792   case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
793   case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
794   case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
795 
796   case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
797   case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
798   case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
799   case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
800   case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
801   case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
802   case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
803   case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
804   case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
805   case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
806   case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
807 
808   case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
809   case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
810   case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
811   case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
812   case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
813   case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
814   case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
815   case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
816   case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
817   case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
818   case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
819 
820   case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
821   case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
822   case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
823   case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
824   case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
825   case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
826   case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
827   case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
828   case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
829   case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
830   case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
831 
832   case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
833   case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
834   case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
835   case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
836   case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
837   case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
838   case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
839   case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
840   case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
841   case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
842   case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
843 
844   case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
845   case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
846   case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
847   case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
848   case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
849   case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
850   case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
851   case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
852   case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
853   case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
854   case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
855 
856   case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
857   case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
858   case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
859   case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
860   case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
861   case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
862   case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
863   case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
864   case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
865   case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
866   case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
867 
868   case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
869   case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
870   case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
871   case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
872   case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
873   case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
874   case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
875   case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
876   case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
877   case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
878   case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
879   }
880   return nullptr;
881 }
882 
883 TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const884 NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
885   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
886     return TypeSplitVector;
887 
888   return TargetLoweringBase::getPreferredVectorAction(VT);
889 }
890 
891 SDValue
LowerGlobalAddress(SDValue Op,SelectionDAG & DAG) const892 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
893   SDLoc dl(Op);
894   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
895   auto PtrVT = getPointerTy(DAG.getDataLayout());
896   Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
897   return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
898 }
899 
getPrototype(const DataLayout & DL,Type * retTy,const ArgListTy & Args,const SmallVectorImpl<ISD::OutputArg> & Outs,unsigned retAlignment,const ImmutableCallSite * CS) const900 std::string NVPTXTargetLowering::getPrototype(
901     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
902     const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
903     const ImmutableCallSite *CS) const {
904   auto PtrVT = getPointerTy(DL);
905 
906   bool isABI = (STI.getSmVersion() >= 20);
907   assert(isABI && "Non-ABI compilation is not supported");
908   if (!isABI)
909     return "";
910 
911   std::stringstream O;
912   O << "prototype_" << uniqueCallSite << " : .callprototype ";
913 
914   if (retTy->getTypeID() == Type::VoidTyID) {
915     O << "()";
916   } else {
917     O << "(";
918     if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
919       unsigned size = 0;
920       if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
921         size = ITy->getBitWidth();
922         if (size < 32)
923           size = 32;
924       } else {
925         assert(retTy->isFloatingPointTy() &&
926                "Floating point type expected here");
927         size = retTy->getPrimitiveSizeInBits();
928       }
929 
930       O << ".param .b" << size << " _";
931     } else if (isa<PointerType>(retTy)) {
932       O << ".param .b" << PtrVT.getSizeInBits() << " _";
933     } else if ((retTy->getTypeID() == Type::StructTyID) ||
934                isa<VectorType>(retTy)) {
935       auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
936       O << ".param .align " << retAlignment << " .b8 _["
937         << DL.getTypeAllocSize(retTy) << "]";
938     } else {
939       llvm_unreachable("Unknown return type");
940     }
941     O << ") ";
942   }
943   O << "_ (";
944 
945   bool first = true;
946 
947   unsigned OIdx = 0;
948   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
949     Type *Ty = Args[i].Ty;
950     if (!first) {
951       O << ", ";
952     }
953     first = false;
954 
955     if (!Outs[OIdx].Flags.isByVal()) {
956       if (Ty->isAggregateType() || Ty->isVectorTy()) {
957         unsigned align = 0;
958         const CallInst *CallI = cast<CallInst>(CS->getInstruction());
959         // +1 because index 0 is reserved for return type alignment
960         if (!llvm::getAlign(*CallI, i + 1, align))
961           align = DL.getABITypeAlignment(Ty);
962         unsigned sz = DL.getTypeAllocSize(Ty);
963         O << ".param .align " << align << " .b8 ";
964         O << "_";
965         O << "[" << sz << "]";
966         // update the index for Outs
967         SmallVector<EVT, 16> vtparts;
968         ComputeValueVTs(*this, DL, Ty, vtparts);
969         if (unsigned len = vtparts.size())
970           OIdx += len - 1;
971         continue;
972       }
973        // i8 types in IR will be i16 types in SDAG
974       assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
975               (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
976              "type mismatch between callee prototype and arguments");
977       // scalar type
978       unsigned sz = 0;
979       if (isa<IntegerType>(Ty)) {
980         sz = cast<IntegerType>(Ty)->getBitWidth();
981         if (sz < 32)
982           sz = 32;
983       } else if (isa<PointerType>(Ty))
984         sz = PtrVT.getSizeInBits();
985       else
986         sz = Ty->getPrimitiveSizeInBits();
987       O << ".param .b" << sz << " ";
988       O << "_";
989       continue;
990     }
991     auto *PTy = dyn_cast<PointerType>(Ty);
992     assert(PTy && "Param with byval attribute should be a pointer type");
993     Type *ETy = PTy->getElementType();
994 
995     unsigned align = Outs[OIdx].Flags.getByValAlign();
996     unsigned sz = DL.getTypeAllocSize(ETy);
997     O << ".param .align " << align << " .b8 ";
998     O << "_";
999     O << "[" << sz << "]";
1000   }
1001   O << ");";
1002   return O.str();
1003 }
1004 
1005 unsigned
getArgumentAlignment(SDValue Callee,const ImmutableCallSite * CS,Type * Ty,unsigned Idx) const1006 NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1007                                           const ImmutableCallSite *CS,
1008                                           Type *Ty,
1009                                           unsigned Idx) const {
1010   unsigned Align = 0;
1011   const Value *DirectCallee = CS->getCalledFunction();
1012 
1013   if (!DirectCallee) {
1014     // We don't have a direct function symbol, but that may be because of
1015     // constant cast instructions in the call.
1016     const Instruction *CalleeI = CS->getInstruction();
1017     assert(CalleeI && "Call target is not a function or derived value?");
1018 
1019     // With bitcast'd call targets, the instruction will be the call
1020     if (isa<CallInst>(CalleeI)) {
1021       // Check if we have call alignment metadata
1022       if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1023         return Align;
1024 
1025       const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1026       // Ignore any bitcast instructions
1027       while(isa<ConstantExpr>(CalleeV)) {
1028         const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1029         if (!CE->isCast())
1030           break;
1031         // Look through the bitcast
1032         CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1033       }
1034 
1035       // We have now looked past all of the bitcasts.  Do we finally have a
1036       // Function?
1037       if (isa<Function>(CalleeV))
1038         DirectCallee = CalleeV;
1039     }
1040   }
1041 
1042   // Check for function alignment information if we found that the
1043   // ultimate target is a Function
1044   if (DirectCallee)
1045     if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
1046       return Align;
1047 
1048   // Call is indirect or alignment information is not available, fall back to
1049   // the ABI type alignment
1050   auto &DL = CS->getCaller()->getParent()->getDataLayout();
1051   return DL.getABITypeAlignment(Ty);
1052 }
1053 
LowerCall(TargetLowering::CallLoweringInfo & CLI,SmallVectorImpl<SDValue> & InVals) const1054 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1055                                        SmallVectorImpl<SDValue> &InVals) const {
1056   SelectionDAG &DAG = CLI.DAG;
1057   SDLoc dl = CLI.DL;
1058   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1059   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1060   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1061   SDValue Chain = CLI.Chain;
1062   SDValue Callee = CLI.Callee;
1063   bool &isTailCall = CLI.IsTailCall;
1064   ArgListTy &Args = CLI.getArgs();
1065   Type *retTy = CLI.RetTy;
1066   ImmutableCallSite *CS = CLI.CS;
1067 
1068   bool isABI = (STI.getSmVersion() >= 20);
1069   assert(isABI && "Non-ABI compilation is not supported");
1070   if (!isABI)
1071     return Chain;
1072   MachineFunction &MF = DAG.getMachineFunction();
1073   const Function *F = MF.getFunction();
1074   auto &DL = MF.getDataLayout();
1075 
1076   SDValue tempChain = Chain;
1077   Chain = DAG.getCALLSEQ_START(Chain,
1078                                DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1079                                dl);
1080   SDValue InFlag = Chain.getValue(1);
1081 
1082   unsigned paramCount = 0;
1083   // Args.size() and Outs.size() need not match.
1084   // Outs.size() will be larger
1085   //   * if there is an aggregate argument with multiple fields (each field
1086   //     showing up separately in Outs)
1087   //   * if there is a vector argument with more than typical vector-length
1088   //     elements (generally if more than 4) where each vector element is
1089   //     individually present in Outs.
1090   // So a different index should be used for indexing into Outs/OutVals.
1091   // See similar issue in LowerFormalArguments.
1092   unsigned OIdx = 0;
1093   // Declare the .params or .reg need to pass values
1094   // to the function
1095   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1096     EVT VT = Outs[OIdx].VT;
1097     Type *Ty = Args[i].Ty;
1098 
1099     if (!Outs[OIdx].Flags.isByVal()) {
1100       if (Ty->isAggregateType()) {
1101         // aggregate
1102         SmallVector<EVT, 16> vtparts;
1103         SmallVector<uint64_t, 16> Offsets;
1104         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
1105                            0);
1106 
1107         unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
1108         // declare .param .align <align> .b8 .param<n>[<size>];
1109         unsigned sz = DL.getTypeAllocSize(Ty);
1110         SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1111         SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
1112                                                              MVT::i32),
1113                                       DAG.getConstant(paramCount, dl, MVT::i32),
1114                                       DAG.getConstant(sz, dl, MVT::i32),
1115                                       InFlag };
1116         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1117                             DeclareParamOps);
1118         InFlag = Chain.getValue(1);
1119         for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
1120           EVT elemtype = vtparts[j];
1121           unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
1122           if (elemtype.isInteger() && (sz < 8))
1123             sz = 8;
1124           SDValue StVal = OutVals[OIdx];
1125           if (elemtype.getSizeInBits() < 16) {
1126             StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1127           }
1128           SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1129           SDValue CopyParamOps[] = { Chain,
1130                                      DAG.getConstant(paramCount, dl, MVT::i32),
1131                                      DAG.getConstant(Offsets[j], dl, MVT::i32),
1132                                      StVal, InFlag };
1133           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
1134                                           CopyParamVTs, CopyParamOps,
1135                                           elemtype, MachinePointerInfo(),
1136                                           ArgAlign);
1137           InFlag = Chain.getValue(1);
1138           ++OIdx;
1139         }
1140         if (vtparts.size() > 0)
1141           --OIdx;
1142         ++paramCount;
1143         continue;
1144       }
1145       if (Ty->isVectorTy()) {
1146         EVT ObjectVT = getValueType(DL, Ty);
1147         unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
1148         // declare .param .align <align> .b8 .param<n>[<size>];
1149         unsigned sz = DL.getTypeAllocSize(Ty);
1150         SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1151         SDValue DeclareParamOps[] = { Chain,
1152                                       DAG.getConstant(align, dl, MVT::i32),
1153                                       DAG.getConstant(paramCount, dl, MVT::i32),
1154                                       DAG.getConstant(sz, dl, MVT::i32),
1155                                       InFlag };
1156         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1157                             DeclareParamOps);
1158         InFlag = Chain.getValue(1);
1159         unsigned NumElts = ObjectVT.getVectorNumElements();
1160         EVT EltVT = ObjectVT.getVectorElementType();
1161         EVT MemVT = EltVT;
1162         bool NeedExtend = false;
1163         if (EltVT.getSizeInBits() < 16) {
1164           NeedExtend = true;
1165           EltVT = MVT::i16;
1166         }
1167 
1168         // V1 store
1169         if (NumElts == 1) {
1170           SDValue Elt = OutVals[OIdx++];
1171           if (NeedExtend)
1172             Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
1173 
1174           SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1175           SDValue CopyParamOps[] = { Chain,
1176                                      DAG.getConstant(paramCount, dl, MVT::i32),
1177                                      DAG.getConstant(0, dl, MVT::i32), Elt,
1178                                      InFlag };
1179           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
1180                                           CopyParamVTs, CopyParamOps,
1181                                           MemVT, MachinePointerInfo());
1182           InFlag = Chain.getValue(1);
1183         } else if (NumElts == 2) {
1184           SDValue Elt0 = OutVals[OIdx++];
1185           SDValue Elt1 = OutVals[OIdx++];
1186           if (NeedExtend) {
1187             Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
1188             Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
1189           }
1190 
1191           SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1192           SDValue CopyParamOps[] = { Chain,
1193                                      DAG.getConstant(paramCount, dl, MVT::i32),
1194                                      DAG.getConstant(0, dl, MVT::i32), Elt0,
1195                                      Elt1, InFlag };
1196           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
1197                                           CopyParamVTs, CopyParamOps,
1198                                           MemVT, MachinePointerInfo());
1199           InFlag = Chain.getValue(1);
1200         } else {
1201           unsigned curOffset = 0;
1202           // V4 stores
1203           // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
1204           // the
1205           // vector will be expanded to a power of 2 elements, so we know we can
1206           // always round up to the next multiple of 4 when creating the vector
1207           // stores.
1208           // e.g.  4 elem => 1 st.v4
1209           //       6 elem => 2 st.v4
1210           //       8 elem => 2 st.v4
1211           //      11 elem => 3 st.v4
1212           unsigned VecSize = 4;
1213           if (EltVT.getSizeInBits() == 64)
1214             VecSize = 2;
1215 
1216           // This is potentially only part of a vector, so assume all elements
1217           // are packed together.
1218           unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
1219 
1220           for (unsigned i = 0; i < NumElts; i += VecSize) {
1221             // Get values
1222             SDValue StoreVal;
1223             SmallVector<SDValue, 8> Ops;
1224             Ops.push_back(Chain);
1225             Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1226             Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
1227 
1228             unsigned Opc = NVPTXISD::StoreParamV2;
1229 
1230             StoreVal = OutVals[OIdx++];
1231             if (NeedExtend)
1232               StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1233             Ops.push_back(StoreVal);
1234 
1235             if (i + 1 < NumElts) {
1236               StoreVal = OutVals[OIdx++];
1237               if (NeedExtend)
1238                 StoreVal =
1239                     DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1240             } else {
1241               StoreVal = DAG.getUNDEF(EltVT);
1242             }
1243             Ops.push_back(StoreVal);
1244 
1245             if (VecSize == 4) {
1246               Opc = NVPTXISD::StoreParamV4;
1247               if (i + 2 < NumElts) {
1248                 StoreVal = OutVals[OIdx++];
1249                 if (NeedExtend)
1250                   StoreVal =
1251                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1252               } else {
1253                 StoreVal = DAG.getUNDEF(EltVT);
1254               }
1255               Ops.push_back(StoreVal);
1256 
1257               if (i + 3 < NumElts) {
1258                 StoreVal = OutVals[OIdx++];
1259                 if (NeedExtend)
1260                   StoreVal =
1261                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1262               } else {
1263                 StoreVal = DAG.getUNDEF(EltVT);
1264               }
1265               Ops.push_back(StoreVal);
1266             }
1267 
1268             Ops.push_back(InFlag);
1269 
1270             SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1271             Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
1272                                             MemVT, MachinePointerInfo());
1273             InFlag = Chain.getValue(1);
1274             curOffset += PerStoreOffset;
1275           }
1276         }
1277         ++paramCount;
1278         --OIdx;
1279         continue;
1280       }
1281       // Plain scalar
1282       // for ABI,    declare .param .b<size> .param<n>;
1283       unsigned sz = VT.getSizeInBits();
1284       bool needExtend = false;
1285       if (VT.isInteger()) {
1286         if (sz < 16)
1287           needExtend = true;
1288         if (sz < 32)
1289           sz = 32;
1290       }
1291       SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1292       SDValue DeclareParamOps[] = { Chain,
1293                                     DAG.getConstant(paramCount, dl, MVT::i32),
1294                                     DAG.getConstant(sz, dl, MVT::i32),
1295                                     DAG.getConstant(0, dl, MVT::i32), InFlag };
1296       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1297                           DeclareParamOps);
1298       InFlag = Chain.getValue(1);
1299       SDValue OutV = OutVals[OIdx];
1300       if (needExtend) {
1301         // zext/sext i1 to i16
1302         unsigned opc = ISD::ZERO_EXTEND;
1303         if (Outs[OIdx].Flags.isSExt())
1304           opc = ISD::SIGN_EXTEND;
1305         OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
1306       }
1307       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1308       SDValue CopyParamOps[] = { Chain,
1309                                  DAG.getConstant(paramCount, dl, MVT::i32),
1310                                  DAG.getConstant(0, dl, MVT::i32), OutV,
1311                                  InFlag };
1312 
1313       unsigned opcode = NVPTXISD::StoreParam;
1314       if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
1315         opcode = NVPTXISD::StoreParamU32;
1316       else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
1317         opcode = NVPTXISD::StoreParamS32;
1318       Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
1319                                       VT, MachinePointerInfo());
1320 
1321       InFlag = Chain.getValue(1);
1322       ++paramCount;
1323       continue;
1324     }
1325     // struct or vector
1326     SmallVector<EVT, 16> vtparts;
1327     SmallVector<uint64_t, 16> Offsets;
1328     auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1329     assert(PTy && "Type of a byval parameter should be pointer");
1330     ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
1331                        vtparts, &Offsets, 0);
1332 
1333     // declare .param .align <align> .b8 .param<n>[<size>];
1334     unsigned sz = Outs[OIdx].Flags.getByValSize();
1335     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1336     unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1337     // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
1338     // so we don't need to worry about natural alignment or not.
1339     // See TargetLowering::LowerCallTo().
1340     SDValue DeclareParamOps[] = {
1341       Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), dl, MVT::i32),
1342       DAG.getConstant(paramCount, dl, MVT::i32),
1343       DAG.getConstant(sz, dl, MVT::i32), InFlag
1344     };
1345     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1346                         DeclareParamOps);
1347     InFlag = Chain.getValue(1);
1348     for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
1349       EVT elemtype = vtparts[j];
1350       int curOffset = Offsets[j];
1351       unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1352       auto PtrVT = getPointerTy(DAG.getDataLayout());
1353       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1354                                     DAG.getConstant(curOffset, dl, PtrVT));
1355       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1356                                    MachinePointerInfo(), false, false, false,
1357                                    PartAlign);
1358       if (elemtype.getSizeInBits() < 16) {
1359         theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1360       }
1361       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1362       SDValue CopyParamOps[] = { Chain,
1363                                  DAG.getConstant(paramCount, dl, MVT::i32),
1364                                  DAG.getConstant(curOffset, dl, MVT::i32),
1365                                  theVal, InFlag };
1366       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1367                                       CopyParamOps, elemtype,
1368                                       MachinePointerInfo());
1369 
1370       InFlag = Chain.getValue(1);
1371     }
1372     ++paramCount;
1373   }
1374 
1375   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1376   unsigned retAlignment = 0;
1377 
1378   // Handle Result
1379   if (Ins.size() > 0) {
1380     SmallVector<EVT, 16> resvtparts;
1381     ComputeValueVTs(*this, DL, retTy, resvtparts);
1382 
1383     // Declare
1384     //  .param .align 16 .b8 retval0[<size-in-bytes>], or
1385     //  .param .b<size-in-bits> retval0
1386     unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
1387     // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1388     // these three types to match the logic in
1389     // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1390     // Plus, this behavior is consistent with nvcc's.
1391     if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
1392         retTy->isPointerTy()) {
1393       // Scalar needs to be at least 32bit wide
1394       if (resultsz < 32)
1395         resultsz = 32;
1396       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1397       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1398                                   DAG.getConstant(resultsz, dl, MVT::i32),
1399                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
1400       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1401                           DeclareRetOps);
1402       InFlag = Chain.getValue(1);
1403     } else {
1404       retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
1405       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1406       SDValue DeclareRetOps[] = { Chain,
1407                                   DAG.getConstant(retAlignment, dl, MVT::i32),
1408                                   DAG.getConstant(resultsz / 8, dl, MVT::i32),
1409                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
1410       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1411                           DeclareRetOps);
1412       InFlag = Chain.getValue(1);
1413     }
1414   }
1415 
1416   if (!Func) {
1417     // This is indirect function call case : PTX requires a prototype of the
1418     // form
1419     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1420     // to be emitted, and the label has to used as the last arg of call
1421     // instruction.
1422     // The prototype is embedded in a string and put as the operand for a
1423     // CallPrototype SDNode which will print out to the value of the string.
1424     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1425     std::string Proto =
1426         getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
1427     const char *ProtoStr =
1428       nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1429     SDValue ProtoOps[] = {
1430       Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1431     };
1432     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1433     InFlag = Chain.getValue(1);
1434   }
1435   // Op to just print "call"
1436   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1437   SDValue PrintCallOps[] = {
1438     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1439   };
1440   // We model convergent calls as separate opcodes.
1441   unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
1442   if (CLI.IsConvergent)
1443     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1444                                               : NVPTXISD::PrintConvergentCall;
1445   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1446   InFlag = Chain.getValue(1);
1447 
1448   // Ops to print out the function name
1449   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1450   SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1451   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1452   InFlag = Chain.getValue(1);
1453 
1454   // Ops to print out the param list
1455   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1456   SDValue CallArgBeginOps[] = { Chain, InFlag };
1457   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1458                       CallArgBeginOps);
1459   InFlag = Chain.getValue(1);
1460 
1461   for (unsigned i = 0, e = paramCount; i != e; ++i) {
1462     unsigned opcode;
1463     if (i == (e - 1))
1464       opcode = NVPTXISD::LastCallArg;
1465     else
1466       opcode = NVPTXISD::CallArg;
1467     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1468     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1469                              DAG.getConstant(i, dl, MVT::i32), InFlag };
1470     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1471     InFlag = Chain.getValue(1);
1472   }
1473   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1474   SDValue CallArgEndOps[] = { Chain,
1475                               DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
1476                               InFlag };
1477   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1478   InFlag = Chain.getValue(1);
1479 
1480   if (!Func) {
1481     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1482     SDValue PrototypeOps[] = { Chain,
1483                                DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1484                                InFlag };
1485     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1486     InFlag = Chain.getValue(1);
1487   }
1488 
1489   // Generate loads from param memory/moves from registers for result
1490   if (Ins.size() > 0) {
1491     if (retTy && retTy->isVectorTy()) {
1492       EVT ObjectVT = getValueType(DL, retTy);
1493       unsigned NumElts = ObjectVT.getVectorNumElements();
1494       EVT EltVT = ObjectVT.getVectorElementType();
1495       assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
1496                                                       ObjectVT) == NumElts &&
1497              "Vector was not scalarized");
1498       unsigned sz = EltVT.getSizeInBits();
1499       bool needTruncate = sz < 8;
1500 
1501       if (NumElts == 1) {
1502         // Just a simple load
1503         SmallVector<EVT, 4> LoadRetVTs;
1504         if (EltVT == MVT::i1 || EltVT == MVT::i8) {
1505           // If loading i1/i8 result, generate
1506           //   load.b8 i16
1507           //   if i1
1508           //   trunc i16 to i1
1509           LoadRetVTs.push_back(MVT::i16);
1510         } else
1511           LoadRetVTs.push_back(EltVT);
1512         LoadRetVTs.push_back(MVT::Other);
1513         LoadRetVTs.push_back(MVT::Glue);
1514         SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1515                                 DAG.getConstant(0, dl, MVT::i32), InFlag};
1516         SDValue retval = DAG.getMemIntrinsicNode(
1517             NVPTXISD::LoadParam, dl,
1518             DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
1519         Chain = retval.getValue(1);
1520         InFlag = retval.getValue(2);
1521         SDValue Ret0 = retval;
1522         if (needTruncate)
1523           Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
1524         InVals.push_back(Ret0);
1525       } else if (NumElts == 2) {
1526         // LoadV2
1527         SmallVector<EVT, 4> LoadRetVTs;
1528         if (EltVT == MVT::i1 || EltVT == MVT::i8) {
1529           // If loading i1/i8 result, generate
1530           //   load.b8 i16
1531           //   if i1
1532           //   trunc i16 to i1
1533           LoadRetVTs.push_back(MVT::i16);
1534           LoadRetVTs.push_back(MVT::i16);
1535         } else {
1536           LoadRetVTs.push_back(EltVT);
1537           LoadRetVTs.push_back(EltVT);
1538         }
1539         LoadRetVTs.push_back(MVT::Other);
1540         LoadRetVTs.push_back(MVT::Glue);
1541         SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1542                                 DAG.getConstant(0, dl, MVT::i32), InFlag};
1543         SDValue retval = DAG.getMemIntrinsicNode(
1544             NVPTXISD::LoadParamV2, dl,
1545             DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
1546         Chain = retval.getValue(2);
1547         InFlag = retval.getValue(3);
1548         SDValue Ret0 = retval.getValue(0);
1549         SDValue Ret1 = retval.getValue(1);
1550         if (needTruncate) {
1551           Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
1552           InVals.push_back(Ret0);
1553           Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
1554           InVals.push_back(Ret1);
1555         } else {
1556           InVals.push_back(Ret0);
1557           InVals.push_back(Ret1);
1558         }
1559       } else {
1560         // Split into N LoadV4
1561         unsigned Ofst = 0;
1562         unsigned VecSize = 4;
1563         unsigned Opc = NVPTXISD::LoadParamV4;
1564         if (EltVT.getSizeInBits() == 64) {
1565           VecSize = 2;
1566           Opc = NVPTXISD::LoadParamV2;
1567         }
1568         EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
1569         for (unsigned i = 0; i < NumElts; i += VecSize) {
1570           SmallVector<EVT, 8> LoadRetVTs;
1571           if (EltVT == MVT::i1 || EltVT == MVT::i8) {
1572             // If loading i1/i8 result, generate
1573             //   load.b8 i16
1574             //   if i1
1575             //   trunc i16 to i1
1576             for (unsigned j = 0; j < VecSize; ++j)
1577               LoadRetVTs.push_back(MVT::i16);
1578           } else {
1579             for (unsigned j = 0; j < VecSize; ++j)
1580               LoadRetVTs.push_back(EltVT);
1581           }
1582           LoadRetVTs.push_back(MVT::Other);
1583           LoadRetVTs.push_back(MVT::Glue);
1584           SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1585                                   DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
1586           SDValue retval = DAG.getMemIntrinsicNode(
1587               Opc, dl, DAG.getVTList(LoadRetVTs),
1588               LoadRetOps, EltVT, MachinePointerInfo());
1589           if (VecSize == 2) {
1590             Chain = retval.getValue(2);
1591             InFlag = retval.getValue(3);
1592           } else {
1593             Chain = retval.getValue(4);
1594             InFlag = retval.getValue(5);
1595           }
1596 
1597           for (unsigned j = 0; j < VecSize; ++j) {
1598             if (i + j >= NumElts)
1599               break;
1600             SDValue Elt = retval.getValue(j);
1601             if (needTruncate)
1602               Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
1603             InVals.push_back(Elt);
1604           }
1605           Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
1606         }
1607       }
1608     } else {
1609       SmallVector<EVT, 16> VTs;
1610       SmallVector<uint64_t, 16> Offsets;
1611       ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0);
1612       assert(VTs.size() == Ins.size() && "Bad value decomposition");
1613       unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
1614       for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
1615         unsigned sz = VTs[i].getSizeInBits();
1616         unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1617         bool needTruncate = false;
1618         if (VTs[i].isInteger() && sz < 8) {
1619           sz = 8;
1620           needTruncate = true;
1621         }
1622 
1623         SmallVector<EVT, 4> LoadRetVTs;
1624         EVT TheLoadType = VTs[i];
1625         if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
1626           // This is for integer types only, and specifically not for
1627           // aggregates.
1628           LoadRetVTs.push_back(MVT::i32);
1629           TheLoadType = MVT::i32;
1630           needTruncate = true;
1631         } else if (sz < 16) {
1632           // If loading i1/i8 result, generate
1633           //   load i8 (-> i16)
1634           //   trunc i16 to i1/i8
1635 
1636           // FIXME: Do we need to set needTruncate to true here, too?  We could
1637           // not figure out what this branch is for in D17872, so we left it
1638           // alone.  The comment above about loading i1/i8 may be wrong, as the
1639           // branch above seems to cover integers of size < 32.
1640           LoadRetVTs.push_back(MVT::i16);
1641         } else
1642           LoadRetVTs.push_back(Ins[i].VT);
1643         LoadRetVTs.push_back(MVT::Other);
1644         LoadRetVTs.push_back(MVT::Glue);
1645 
1646         SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1647                                 DAG.getConstant(Offsets[i], dl, MVT::i32),
1648                                 InFlag};
1649         SDValue retval = DAG.getMemIntrinsicNode(
1650             NVPTXISD::LoadParam, dl,
1651             DAG.getVTList(LoadRetVTs), LoadRetOps,
1652             TheLoadType, MachinePointerInfo(), AlignI);
1653         Chain = retval.getValue(1);
1654         InFlag = retval.getValue(2);
1655         SDValue Ret0 = retval.getValue(0);
1656         if (needTruncate)
1657           Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
1658         InVals.push_back(Ret0);
1659       }
1660     }
1661   }
1662 
1663   Chain = DAG.getCALLSEQ_END(Chain,
1664                              DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1665                              DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1666                                                    true),
1667                              InFlag, dl);
1668   uniqueCallSite++;
1669 
1670   // set isTailCall to false for now, until we figure out how to express
1671   // tail call optimization in PTX
1672   isTailCall = false;
1673   return Chain;
1674 }
1675 
1676 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1677 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1678 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
1679 SDValue
LowerCONCAT_VECTORS(SDValue Op,SelectionDAG & DAG) const1680 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1681   SDNode *Node = Op.getNode();
1682   SDLoc dl(Node);
1683   SmallVector<SDValue, 8> Ops;
1684   unsigned NumOperands = Node->getNumOperands();
1685   for (unsigned i = 0; i < NumOperands; ++i) {
1686     SDValue SubOp = Node->getOperand(i);
1687     EVT VVT = SubOp.getNode()->getValueType(0);
1688     EVT EltVT = VVT.getVectorElementType();
1689     unsigned NumSubElem = VVT.getVectorNumElements();
1690     for (unsigned j = 0; j < NumSubElem; ++j) {
1691       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1692                                 DAG.getIntPtrConstant(j, dl)));
1693     }
1694   }
1695   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1696 }
1697 
1698 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1699 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
1700 ///    amount, or
1701 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
1702 ///    amount.
LowerShiftRightParts(SDValue Op,SelectionDAG & DAG) const1703 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1704                                                   SelectionDAG &DAG) const {
1705   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1706   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1707 
1708   EVT VT = Op.getValueType();
1709   unsigned VTBits = VT.getSizeInBits();
1710   SDLoc dl(Op);
1711   SDValue ShOpLo = Op.getOperand(0);
1712   SDValue ShOpHi = Op.getOperand(1);
1713   SDValue ShAmt  = Op.getOperand(2);
1714   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1715 
1716   if (VTBits == 32 && STI.getSmVersion() >= 35) {
1717 
1718     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
1719     // {dHi, dLo} = {aHi, aLo} >> Amt
1720     //   dHi = aHi >> Amt
1721     //   dLo = shf.r.clamp aLo, aHi, Amt
1722 
1723     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1724     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1725                              ShAmt);
1726 
1727     SDValue Ops[2] = { Lo, Hi };
1728     return DAG.getMergeValues(Ops, dl);
1729   }
1730   else {
1731 
1732     // {dHi, dLo} = {aHi, aLo} >> Amt
1733     // - if (Amt>=size) then
1734     //      dLo = aHi >> (Amt-size)
1735     //      dHi = aHi >> Amt (this is either all 0 or all 1)
1736     //   else
1737     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1738     //      dHi = aHi >> Amt
1739 
1740     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1741                                    DAG.getConstant(VTBits, dl, MVT::i32),
1742                                    ShAmt);
1743     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1744     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1745                                      DAG.getConstant(VTBits, dl, MVT::i32));
1746     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1747     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1748     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
1749 
1750     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1751                                DAG.getConstant(VTBits, dl, MVT::i32),
1752                                ISD::SETGE);
1753     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1754     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
1755 
1756     SDValue Ops[2] = { Lo, Hi };
1757     return DAG.getMergeValues(Ops, dl);
1758   }
1759 }
1760 
1761 /// LowerShiftLeftParts - Lower SHL_PARTS, which
1762 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
1763 ///    amount, or
1764 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
1765 ///    amount.
LowerShiftLeftParts(SDValue Op,SelectionDAG & DAG) const1766 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
1767                                                  SelectionDAG &DAG) const {
1768   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1769   assert(Op.getOpcode() == ISD::SHL_PARTS);
1770 
1771   EVT VT = Op.getValueType();
1772   unsigned VTBits = VT.getSizeInBits();
1773   SDLoc dl(Op);
1774   SDValue ShOpLo = Op.getOperand(0);
1775   SDValue ShOpHi = Op.getOperand(1);
1776   SDValue ShAmt  = Op.getOperand(2);
1777 
1778   if (VTBits == 32 && STI.getSmVersion() >= 35) {
1779 
1780     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
1781     // {dHi, dLo} = {aHi, aLo} << Amt
1782     //   dHi = shf.l.clamp aLo, aHi, Amt
1783     //   dLo = aLo << Amt
1784 
1785     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
1786                              ShAmt);
1787     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
1788 
1789     SDValue Ops[2] = { Lo, Hi };
1790     return DAG.getMergeValues(Ops, dl);
1791   }
1792   else {
1793 
1794     // {dHi, dLo} = {aHi, aLo} << Amt
1795     // - if (Amt>=size) then
1796     //      dLo = aLo << Amt (all 0)
1797     //      dLo = aLo << (Amt-size)
1798     //   else
1799     //      dLo = aLo << Amt
1800     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
1801 
1802     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1803                                    DAG.getConstant(VTBits, dl, MVT::i32),
1804                                    ShAmt);
1805     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
1806     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1807                                      DAG.getConstant(VTBits, dl, MVT::i32));
1808     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
1809     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1810     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
1811 
1812     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1813                                DAG.getConstant(VTBits, dl, MVT::i32),
1814                                ISD::SETGE);
1815     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
1816     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
1817 
1818     SDValue Ops[2] = { Lo, Hi };
1819     return DAG.getMergeValues(Ops, dl);
1820   }
1821 }
1822 
1823 SDValue
LowerOperation(SDValue Op,SelectionDAG & DAG) const1824 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1825   switch (Op.getOpcode()) {
1826   case ISD::RETURNADDR:
1827     return SDValue();
1828   case ISD::FRAMEADDR:
1829     return SDValue();
1830   case ISD::GlobalAddress:
1831     return LowerGlobalAddress(Op, DAG);
1832   case ISD::INTRINSIC_W_CHAIN:
1833     return Op;
1834   case ISD::BUILD_VECTOR:
1835   case ISD::EXTRACT_SUBVECTOR:
1836     return Op;
1837   case ISD::CONCAT_VECTORS:
1838     return LowerCONCAT_VECTORS(Op, DAG);
1839   case ISD::STORE:
1840     return LowerSTORE(Op, DAG);
1841   case ISD::LOAD:
1842     return LowerLOAD(Op, DAG);
1843   case ISD::SHL_PARTS:
1844     return LowerShiftLeftParts(Op, DAG);
1845   case ISD::SRA_PARTS:
1846   case ISD::SRL_PARTS:
1847     return LowerShiftRightParts(Op, DAG);
1848   case ISD::SELECT:
1849     return LowerSelect(Op, DAG);
1850   default:
1851     llvm_unreachable("Custom lowering not defined for operation");
1852   }
1853 }
1854 
LowerSelect(SDValue Op,SelectionDAG & DAG) const1855 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
1856   SDValue Op0 = Op->getOperand(0);
1857   SDValue Op1 = Op->getOperand(1);
1858   SDValue Op2 = Op->getOperand(2);
1859   SDLoc DL(Op.getNode());
1860 
1861   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
1862 
1863   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
1864   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
1865   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
1866   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
1867 
1868   return Trunc;
1869 }
1870 
LowerLOAD(SDValue Op,SelectionDAG & DAG) const1871 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1872   if (Op.getValueType() == MVT::i1)
1873     return LowerLOADi1(Op, DAG);
1874   else
1875     return SDValue();
1876 }
1877 
1878 // v = ld i1* addr
1879 //   =>
1880 // v1 = ld i8* addr (-> i16)
1881 // v = trunc i16 to i1
LowerLOADi1(SDValue Op,SelectionDAG & DAG) const1882 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
1883   SDNode *Node = Op.getNode();
1884   LoadSDNode *LD = cast<LoadSDNode>(Node);
1885   SDLoc dl(Node);
1886   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
1887   assert(Node->getValueType(0) == MVT::i1 &&
1888          "Custom lowering for i1 load only");
1889   SDValue newLD =
1890       DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
1891                   LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
1892                   LD->isInvariant(), LD->getAlignment());
1893   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
1894   // The legalizer (the caller) is expecting two values from the legalized
1895   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
1896   // in LegalizeDAG.cpp which also uses MergeValues.
1897   SDValue Ops[] = { result, LD->getChain() };
1898   return DAG.getMergeValues(Ops, dl);
1899 }
1900 
LowerSTORE(SDValue Op,SelectionDAG & DAG) const1901 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1902   EVT ValVT = Op.getOperand(1).getValueType();
1903   if (ValVT == MVT::i1)
1904     return LowerSTOREi1(Op, DAG);
1905   else if (ValVT.isVector())
1906     return LowerSTOREVector(Op, DAG);
1907   else
1908     return SDValue();
1909 }
1910 
1911 SDValue
LowerSTOREVector(SDValue Op,SelectionDAG & DAG) const1912 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
1913   SDNode *N = Op.getNode();
1914   SDValue Val = N->getOperand(1);
1915   SDLoc DL(N);
1916   EVT ValVT = Val.getValueType();
1917 
1918   if (ValVT.isVector()) {
1919     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
1920     // legal.  We can (and should) split that into 2 stores of <2 x double> here
1921     // but I'm leaving that as a TODO for now.
1922     if (!ValVT.isSimple())
1923       return SDValue();
1924     switch (ValVT.getSimpleVT().SimpleTy) {
1925     default:
1926       return SDValue();
1927     case MVT::v2i8:
1928     case MVT::v2i16:
1929     case MVT::v2i32:
1930     case MVT::v2i64:
1931     case MVT::v2f32:
1932     case MVT::v2f64:
1933     case MVT::v4i8:
1934     case MVT::v4i16:
1935     case MVT::v4i32:
1936     case MVT::v4f32:
1937       // This is a "native" vector type
1938       break;
1939     }
1940 
1941     MemSDNode *MemSD = cast<MemSDNode>(N);
1942     const DataLayout &TD = DAG.getDataLayout();
1943 
1944     unsigned Align = MemSD->getAlignment();
1945     unsigned PrefAlign =
1946         TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
1947     if (Align < PrefAlign) {
1948       // This store is not sufficiently aligned, so bail out and let this vector
1949       // store be scalarized.  Note that we may still be able to emit smaller
1950       // vector stores.  For example, if we are storing a <4 x float> with an
1951       // alignment of 8, this check will fail but the legalizer will try again
1952       // with 2 x <2 x float>, which will succeed with an alignment of 8.
1953       return SDValue();
1954     }
1955 
1956     unsigned Opcode = 0;
1957     EVT EltVT = ValVT.getVectorElementType();
1958     unsigned NumElts = ValVT.getVectorNumElements();
1959 
1960     // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
1961     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
1962     // stored type to i16 and propagate the "real" type as the memory type.
1963     bool NeedExt = false;
1964     if (EltVT.getSizeInBits() < 16)
1965       NeedExt = true;
1966 
1967     switch (NumElts) {
1968     default:
1969       return SDValue();
1970     case 2:
1971       Opcode = NVPTXISD::StoreV2;
1972       break;
1973     case 4: {
1974       Opcode = NVPTXISD::StoreV4;
1975       break;
1976     }
1977     }
1978 
1979     SmallVector<SDValue, 8> Ops;
1980 
1981     // First is the chain
1982     Ops.push_back(N->getOperand(0));
1983 
1984     // Then the split values
1985     for (unsigned i = 0; i < NumElts; ++i) {
1986       SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
1987                                    DAG.getIntPtrConstant(i, DL));
1988       if (NeedExt)
1989         ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
1990       Ops.push_back(ExtVal);
1991     }
1992 
1993     // Then any remaining arguments
1994     Ops.append(N->op_begin() + 2, N->op_end());
1995 
1996     SDValue NewSt = DAG.getMemIntrinsicNode(
1997         Opcode, DL, DAG.getVTList(MVT::Other), Ops,
1998         MemSD->getMemoryVT(), MemSD->getMemOperand());
1999 
2000     //return DCI.CombineTo(N, NewSt, true);
2001     return NewSt;
2002   }
2003 
2004   return SDValue();
2005 }
2006 
2007 // st i1 v, addr
2008 //    =>
2009 // v1 = zxt v to i16
2010 // st.u8 i16, addr
LowerSTOREi1(SDValue Op,SelectionDAG & DAG) const2011 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2012   SDNode *Node = Op.getNode();
2013   SDLoc dl(Node);
2014   StoreSDNode *ST = cast<StoreSDNode>(Node);
2015   SDValue Tmp1 = ST->getChain();
2016   SDValue Tmp2 = ST->getBasePtr();
2017   SDValue Tmp3 = ST->getValue();
2018   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2019   unsigned Alignment = ST->getAlignment();
2020   bool isVolatile = ST->isVolatile();
2021   bool isNonTemporal = ST->isNonTemporal();
2022   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2023   SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
2024                                      ST->getPointerInfo(), MVT::i8, isNonTemporal,
2025                                      isVolatile, Alignment);
2026   return Result;
2027 }
2028 
2029 SDValue
getParamSymbol(SelectionDAG & DAG,int idx,EVT v) const2030 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2031   std::string ParamSym;
2032   raw_string_ostream ParamStr(ParamSym);
2033 
2034   ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2035   ParamStr.flush();
2036 
2037   std::string *SavedStr =
2038     nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2039   return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2040 }
2041 
2042 // Check to see if the kernel argument is image*_t or sampler_t
2043 
isImageOrSamplerVal(const Value * arg,const Module * context)2044 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2045   static const char *const specialTypes[] = { "struct._image2d_t",
2046                                               "struct._image3d_t",
2047                                               "struct._sampler_t" };
2048 
2049   Type *Ty = arg->getType();
2050   auto *PTy = dyn_cast<PointerType>(Ty);
2051 
2052   if (!PTy)
2053     return false;
2054 
2055   if (!context)
2056     return false;
2057 
2058   auto *STy = dyn_cast<StructType>(PTy->getElementType());
2059   if (!STy || STy->isLiteral())
2060     return false;
2061 
2062   return std::find(std::begin(specialTypes), std::end(specialTypes),
2063                    STy->getName()) != std::end(specialTypes);
2064 }
2065 
LowerFormalArguments(SDValue Chain,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::InputArg> & Ins,const SDLoc & dl,SelectionDAG & DAG,SmallVectorImpl<SDValue> & InVals) const2066 SDValue NVPTXTargetLowering::LowerFormalArguments(
2067     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2068     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2069     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2070   MachineFunction &MF = DAG.getMachineFunction();
2071   const DataLayout &DL = DAG.getDataLayout();
2072   auto PtrVT = getPointerTy(DAG.getDataLayout());
2073 
2074   const Function *F = MF.getFunction();
2075   const AttributeSet &PAL = F->getAttributes();
2076   const TargetLowering *TLI = STI.getTargetLowering();
2077 
2078   SDValue Root = DAG.getRoot();
2079   std::vector<SDValue> OutChains;
2080 
2081   bool isKernel = llvm::isKernelFunction(*F);
2082   bool isABI = (STI.getSmVersion() >= 20);
2083   assert(isABI && "Non-ABI compilation is not supported");
2084   if (!isABI)
2085     return Chain;
2086 
2087   std::vector<Type *> argTypes;
2088   std::vector<const Argument *> theArgs;
2089   for (const Argument &I : F->args()) {
2090     theArgs.push_back(&I);
2091     argTypes.push_back(I.getType());
2092   }
2093   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2094   // Ins.size() will be larger
2095   //   * if there is an aggregate argument with multiple fields (each field
2096   //     showing up separately in Ins)
2097   //   * if there is a vector argument with more than typical vector-length
2098   //     elements (generally if more than 4) where each vector element is
2099   //     individually present in Ins.
2100   // So a different index should be used for indexing into Ins.
2101   // See similar issue in LowerCall.
2102   unsigned InsIdx = 0;
2103 
2104   int idx = 0;
2105   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2106     Type *Ty = argTypes[i];
2107 
2108     // If the kernel argument is image*_t or sampler_t, convert it to
2109     // a i32 constant holding the parameter position. This can later
2110     // matched in the AsmPrinter to output the correct mangled name.
2111     if (isImageOrSamplerVal(
2112             theArgs[i],
2113             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2114                                      : nullptr))) {
2115       assert(isKernel && "Only kernels can have image/sampler params");
2116       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2117       continue;
2118     }
2119 
2120     if (theArgs[i]->use_empty()) {
2121       // argument is dead
2122       if (Ty->isAggregateType()) {
2123         SmallVector<EVT, 16> vtparts;
2124 
2125         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2126         assert(vtparts.size() > 0 && "empty aggregate type not expected");
2127         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2128              ++parti) {
2129           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2130           ++InsIdx;
2131         }
2132         if (vtparts.size() > 0)
2133           --InsIdx;
2134         continue;
2135       }
2136       if (Ty->isVectorTy()) {
2137         EVT ObjectVT = getValueType(DL, Ty);
2138         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2139         for (unsigned parti = 0; parti < NumRegs; ++parti) {
2140           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2141           ++InsIdx;
2142         }
2143         if (NumRegs > 0)
2144           --InsIdx;
2145         continue;
2146       }
2147       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2148       continue;
2149     }
2150 
2151     // In the following cases, assign a node order of "idx+1"
2152     // to newly created nodes. The SDNodes for params have to
2153     // appear in the same order as their order of appearance
2154     // in the original function. "idx+1" holds that order.
2155     if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
2156       if (Ty->isAggregateType()) {
2157         SmallVector<EVT, 16> vtparts;
2158         SmallVector<uint64_t, 16> offsets;
2159 
2160         // NOTE: Here, we lose the ability to issue vector loads for vectors
2161         // that are a part of a struct.  This should be investigated in the
2162         // future.
2163         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
2164                            0);
2165         assert(vtparts.size() > 0 && "empty aggregate type not expected");
2166         bool aggregateIsPacked = false;
2167         if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
2168           aggregateIsPacked = STy->isPacked();
2169 
2170         SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2171         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2172              ++parti) {
2173           EVT partVT = vtparts[parti];
2174           Value *srcValue = Constant::getNullValue(
2175               PointerType::get(partVT.getTypeForEVT(F->getContext()),
2176                                llvm::ADDRESS_SPACE_PARAM));
2177           SDValue srcAddr =
2178               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2179                           DAG.getConstant(offsets[parti], dl, PtrVT));
2180           unsigned partAlign = aggregateIsPacked
2181                                    ? 1
2182                                    : DL.getABITypeAlignment(
2183                                          partVT.getTypeForEVT(F->getContext()));
2184           SDValue p;
2185           if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
2186             ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
2187                                      ISD::SEXTLOAD : ISD::ZEXTLOAD;
2188             p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
2189                                MachinePointerInfo(srcValue), partVT, false,
2190                                false, false, partAlign);
2191           } else {
2192             p = DAG.getLoad(partVT, dl, Root, srcAddr,
2193                             MachinePointerInfo(srcValue), false, false, false,
2194                             partAlign);
2195           }
2196           if (p.getNode())
2197             p.getNode()->setIROrder(idx + 1);
2198           InVals.push_back(p);
2199           ++InsIdx;
2200         }
2201         if (vtparts.size() > 0)
2202           --InsIdx;
2203         continue;
2204       }
2205       if (Ty->isVectorTy()) {
2206         EVT ObjectVT = getValueType(DL, Ty);
2207         SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2208         unsigned NumElts = ObjectVT.getVectorNumElements();
2209         assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
2210                "Vector was not scalarized");
2211         EVT EltVT = ObjectVT.getVectorElementType();
2212 
2213         // V1 load
2214         // f32 = load ...
2215         if (NumElts == 1) {
2216           // We only have one element, so just directly load it
2217           Value *SrcValue = Constant::getNullValue(PointerType::get(
2218               EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
2219           SDValue P = DAG.getLoad(
2220               EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
2221               true,
2222               DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
2223           if (P.getNode())
2224             P.getNode()->setIROrder(idx + 1);
2225 
2226           if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
2227             P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
2228           InVals.push_back(P);
2229           ++InsIdx;
2230         } else if (NumElts == 2) {
2231           // V2 load
2232           // f32,f32 = load ...
2233           EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
2234           Value *SrcValue = Constant::getNullValue(PointerType::get(
2235               VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
2236           SDValue P = DAG.getLoad(
2237               VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
2238               true,
2239               DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
2240           if (P.getNode())
2241             P.getNode()->setIROrder(idx + 1);
2242 
2243           SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
2244                                      DAG.getIntPtrConstant(0, dl));
2245           SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
2246                                      DAG.getIntPtrConstant(1, dl));
2247 
2248           if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
2249             Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
2250             Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
2251           }
2252 
2253           InVals.push_back(Elt0);
2254           InVals.push_back(Elt1);
2255           InsIdx += 2;
2256         } else {
2257           // V4 loads
2258           // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
2259           // the
2260           // vector will be expanded to a power of 2 elements, so we know we can
2261           // always round up to the next multiple of 4 when creating the vector
2262           // loads.
2263           // e.g.  4 elem => 1 ld.v4
2264           //       6 elem => 2 ld.v4
2265           //       8 elem => 2 ld.v4
2266           //      11 elem => 3 ld.v4
2267           unsigned VecSize = 4;
2268           if (EltVT.getSizeInBits() == 64) {
2269             VecSize = 2;
2270           }
2271           EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
2272           unsigned Ofst = 0;
2273           for (unsigned i = 0; i < NumElts; i += VecSize) {
2274             Value *SrcValue = Constant::getNullValue(
2275                 PointerType::get(VecVT.getTypeForEVT(F->getContext()),
2276                                  llvm::ADDRESS_SPACE_PARAM));
2277             SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2278                                           DAG.getConstant(Ofst, dl, PtrVT));
2279             SDValue P = DAG.getLoad(
2280                 VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
2281                 false, true,
2282                 DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
2283             if (P.getNode())
2284               P.getNode()->setIROrder(idx + 1);
2285 
2286             for (unsigned j = 0; j < VecSize; ++j) {
2287               if (i + j >= NumElts)
2288                 break;
2289               SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
2290                                         DAG.getIntPtrConstant(j, dl));
2291               if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
2292                 Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
2293               InVals.push_back(Elt);
2294             }
2295             Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
2296           }
2297           InsIdx += NumElts;
2298         }
2299 
2300         if (NumElts > 0)
2301           --InsIdx;
2302         continue;
2303       }
2304       // A plain scalar.
2305       EVT ObjectVT = getValueType(DL, Ty);
2306       // If ABI, load from the param symbol
2307       SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2308       Value *srcValue = Constant::getNullValue(PointerType::get(
2309           ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
2310       SDValue p;
2311        if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
2312         ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
2313                                        ISD::SEXTLOAD : ISD::ZEXTLOAD;
2314         p = DAG.getExtLoad(
2315             ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
2316             ObjectVT, false, false, false,
2317             DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
2318       } else {
2319         p = DAG.getLoad(
2320             Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false,
2321             false, false,
2322             DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
2323       }
2324       if (p.getNode())
2325         p.getNode()->setIROrder(idx + 1);
2326       InVals.push_back(p);
2327       continue;
2328     }
2329 
2330     // Param has ByVal attribute
2331     // Return MoveParam(param symbol).
2332     // Ideally, the param symbol can be returned directly,
2333     // but when SDNode builder decides to use it in a CopyToReg(),
2334     // machine instruction fails because TargetExternalSymbol
2335     // (not lowered) is target dependent, and CopyToReg assumes
2336     // the source is lowered.
2337     EVT ObjectVT = getValueType(DL, Ty);
2338     assert(ObjectVT == Ins[InsIdx].VT &&
2339            "Ins type did not match function type");
2340     SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2341     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2342     if (p.getNode())
2343       p.getNode()->setIROrder(idx + 1);
2344     if (isKernel)
2345       InVals.push_back(p);
2346     else {
2347       SDValue p2 = DAG.getNode(
2348           ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
2349           DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
2350       InVals.push_back(p2);
2351     }
2352   }
2353 
2354   // Clang will check explicit VarArg and issue error if any. However, Clang
2355   // will let code with
2356   // implicit var arg like f() pass. See bug 617733.
2357   // We treat this case as if the arg list is empty.
2358   // if (F.isVarArg()) {
2359   // assert(0 && "VarArg not supported yet!");
2360   //}
2361 
2362   if (!OutChains.empty())
2363     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2364 
2365   return Chain;
2366 }
2367 
2368 SDValue
LowerReturn(SDValue Chain,CallingConv::ID CallConv,bool isVarArg,const SmallVectorImpl<ISD::OutputArg> & Outs,const SmallVectorImpl<SDValue> & OutVals,const SDLoc & dl,SelectionDAG & DAG) const2369 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2370                                  bool isVarArg,
2371                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
2372                                  const SmallVectorImpl<SDValue> &OutVals,
2373                                  const SDLoc &dl, SelectionDAG &DAG) const {
2374   MachineFunction &MF = DAG.getMachineFunction();
2375   const Function *F = MF.getFunction();
2376   Type *RetTy = F->getReturnType();
2377   const DataLayout &TD = DAG.getDataLayout();
2378 
2379   bool isABI = (STI.getSmVersion() >= 20);
2380   assert(isABI && "Non-ABI compilation is not supported");
2381   if (!isABI)
2382     return Chain;
2383 
2384   if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
2385     // If we have a vector type, the OutVals array will be the scalarized
2386     // components and we have combine them into 1 or more vector stores.
2387     unsigned NumElts = VTy->getNumElements();
2388     assert(NumElts == Outs.size() && "Bad scalarization of return value");
2389 
2390     // const_cast can be removed in later LLVM versions
2391     EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
2392     bool NeedExtend = false;
2393     if (EltVT.getSizeInBits() < 16)
2394       NeedExtend = true;
2395 
2396     // V1 store
2397     if (NumElts == 1) {
2398       SDValue StoreVal = OutVals[0];
2399       // We only have one element, so just directly store it
2400       if (NeedExtend)
2401         StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
2402       SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
2403       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
2404                                       DAG.getVTList(MVT::Other), Ops,
2405                                       EltVT, MachinePointerInfo());
2406 
2407     } else if (NumElts == 2) {
2408       // V2 store
2409       SDValue StoreVal0 = OutVals[0];
2410       SDValue StoreVal1 = OutVals[1];
2411 
2412       if (NeedExtend) {
2413         StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
2414         StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
2415       }
2416 
2417       SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
2418                         StoreVal1 };
2419       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
2420                                       DAG.getVTList(MVT::Other), Ops,
2421                                       EltVT, MachinePointerInfo());
2422     } else {
2423       // V4 stores
2424       // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
2425       // vector will be expanded to a power of 2 elements, so we know we can
2426       // always round up to the next multiple of 4 when creating the vector
2427       // stores.
2428       // e.g.  4 elem => 1 st.v4
2429       //       6 elem => 2 st.v4
2430       //       8 elem => 2 st.v4
2431       //      11 elem => 3 st.v4
2432 
2433       unsigned VecSize = 4;
2434       if (OutVals[0].getValueType().getSizeInBits() == 64)
2435         VecSize = 2;
2436 
2437       unsigned Offset = 0;
2438 
2439       EVT VecVT =
2440           EVT::getVectorVT(F->getContext(), EltVT, VecSize);
2441       unsigned PerStoreOffset =
2442           TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
2443 
2444       for (unsigned i = 0; i < NumElts; i += VecSize) {
2445         // Get values
2446         SDValue StoreVal;
2447         SmallVector<SDValue, 8> Ops;
2448         Ops.push_back(Chain);
2449         Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
2450         unsigned Opc = NVPTXISD::StoreRetvalV2;
2451         EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
2452 
2453         StoreVal = OutVals[i];
2454         if (NeedExtend)
2455           StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2456         Ops.push_back(StoreVal);
2457 
2458         if (i + 1 < NumElts) {
2459           StoreVal = OutVals[i + 1];
2460           if (NeedExtend)
2461             StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2462         } else {
2463           StoreVal = DAG.getUNDEF(ExtendedVT);
2464         }
2465         Ops.push_back(StoreVal);
2466 
2467         if (VecSize == 4) {
2468           Opc = NVPTXISD::StoreRetvalV4;
2469           if (i + 2 < NumElts) {
2470             StoreVal = OutVals[i + 2];
2471             if (NeedExtend)
2472               StoreVal =
2473                   DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2474           } else {
2475             StoreVal = DAG.getUNDEF(ExtendedVT);
2476           }
2477           Ops.push_back(StoreVal);
2478 
2479           if (i + 3 < NumElts) {
2480             StoreVal = OutVals[i + 3];
2481             if (NeedExtend)
2482               StoreVal =
2483                   DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2484           } else {
2485             StoreVal = DAG.getUNDEF(ExtendedVT);
2486           }
2487           Ops.push_back(StoreVal);
2488         }
2489 
2490         // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
2491         Chain =
2492             DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
2493                                     EltVT, MachinePointerInfo());
2494         Offset += PerStoreOffset;
2495       }
2496     }
2497   } else {
2498     SmallVector<EVT, 16> ValVTs;
2499     SmallVector<uint64_t, 16> Offsets;
2500     ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
2501     assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
2502 
2503     for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
2504       SDValue theVal = OutVals[i];
2505       EVT TheValType = theVal.getValueType();
2506       unsigned numElems = 1;
2507       if (TheValType.isVector())
2508         numElems = TheValType.getVectorNumElements();
2509       for (unsigned j = 0, je = numElems; j != je; ++j) {
2510         SDValue TmpVal = theVal;
2511         if (TheValType.isVector())
2512           TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
2513                                TheValType.getVectorElementType(), TmpVal,
2514                                DAG.getIntPtrConstant(j, dl));
2515         EVT TheStoreType = ValVTs[i];
2516         if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
2517           // The following zero-extension is for integer types only, and
2518           // specifically not for aggregates.
2519           TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
2520           TheStoreType = MVT::i32;
2521         }
2522         else if (TmpVal.getValueType().getSizeInBits() < 16)
2523           TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
2524 
2525         SDValue Ops[] = {
2526           Chain,
2527           DAG.getConstant(Offsets[i], dl, MVT::i32),
2528           TmpVal };
2529         Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
2530                                         DAG.getVTList(MVT::Other), Ops,
2531                                         TheStoreType,
2532                                         MachinePointerInfo());
2533       }
2534     }
2535   }
2536 
2537   return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2538 }
2539 
2540 
LowerAsmOperandForConstraint(SDValue Op,std::string & Constraint,std::vector<SDValue> & Ops,SelectionDAG & DAG) const2541 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2542     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2543     SelectionDAG &DAG) const {
2544   if (Constraint.length() > 1)
2545     return;
2546   else
2547     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2548 }
2549 
getOpcForTextureInstr(unsigned Intrinsic)2550 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2551   switch (Intrinsic) {
2552   default:
2553     return 0;
2554 
2555   case Intrinsic::nvvm_tex_1d_v4f32_s32:
2556     return NVPTXISD::Tex1DFloatS32;
2557   case Intrinsic::nvvm_tex_1d_v4f32_f32:
2558     return NVPTXISD::Tex1DFloatFloat;
2559   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2560     return NVPTXISD::Tex1DFloatFloatLevel;
2561   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2562     return NVPTXISD::Tex1DFloatFloatGrad;
2563   case Intrinsic::nvvm_tex_1d_v4s32_s32:
2564     return NVPTXISD::Tex1DS32S32;
2565   case Intrinsic::nvvm_tex_1d_v4s32_f32:
2566     return NVPTXISD::Tex1DS32Float;
2567   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2568     return NVPTXISD::Tex1DS32FloatLevel;
2569   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2570     return NVPTXISD::Tex1DS32FloatGrad;
2571   case Intrinsic::nvvm_tex_1d_v4u32_s32:
2572     return NVPTXISD::Tex1DU32S32;
2573   case Intrinsic::nvvm_tex_1d_v4u32_f32:
2574     return NVPTXISD::Tex1DU32Float;
2575   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2576     return NVPTXISD::Tex1DU32FloatLevel;
2577   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2578     return NVPTXISD::Tex1DU32FloatGrad;
2579 
2580   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2581     return NVPTXISD::Tex1DArrayFloatS32;
2582   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2583     return NVPTXISD::Tex1DArrayFloatFloat;
2584   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2585     return NVPTXISD::Tex1DArrayFloatFloatLevel;
2586   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2587     return NVPTXISD::Tex1DArrayFloatFloatGrad;
2588   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2589     return NVPTXISD::Tex1DArrayS32S32;
2590   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2591     return NVPTXISD::Tex1DArrayS32Float;
2592   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2593     return NVPTXISD::Tex1DArrayS32FloatLevel;
2594   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2595     return NVPTXISD::Tex1DArrayS32FloatGrad;
2596   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2597     return NVPTXISD::Tex1DArrayU32S32;
2598   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2599     return NVPTXISD::Tex1DArrayU32Float;
2600   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2601     return NVPTXISD::Tex1DArrayU32FloatLevel;
2602   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2603     return NVPTXISD::Tex1DArrayU32FloatGrad;
2604 
2605   case Intrinsic::nvvm_tex_2d_v4f32_s32:
2606     return NVPTXISD::Tex2DFloatS32;
2607   case Intrinsic::nvvm_tex_2d_v4f32_f32:
2608     return NVPTXISD::Tex2DFloatFloat;
2609   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2610     return NVPTXISD::Tex2DFloatFloatLevel;
2611   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2612     return NVPTXISD::Tex2DFloatFloatGrad;
2613   case Intrinsic::nvvm_tex_2d_v4s32_s32:
2614     return NVPTXISD::Tex2DS32S32;
2615   case Intrinsic::nvvm_tex_2d_v4s32_f32:
2616     return NVPTXISD::Tex2DS32Float;
2617   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2618     return NVPTXISD::Tex2DS32FloatLevel;
2619   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2620     return NVPTXISD::Tex2DS32FloatGrad;
2621   case Intrinsic::nvvm_tex_2d_v4u32_s32:
2622     return NVPTXISD::Tex2DU32S32;
2623   case Intrinsic::nvvm_tex_2d_v4u32_f32:
2624     return NVPTXISD::Tex2DU32Float;
2625   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2626     return NVPTXISD::Tex2DU32FloatLevel;
2627   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2628     return NVPTXISD::Tex2DU32FloatGrad;
2629 
2630   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2631     return NVPTXISD::Tex2DArrayFloatS32;
2632   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2633     return NVPTXISD::Tex2DArrayFloatFloat;
2634   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2635     return NVPTXISD::Tex2DArrayFloatFloatLevel;
2636   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2637     return NVPTXISD::Tex2DArrayFloatFloatGrad;
2638   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2639     return NVPTXISD::Tex2DArrayS32S32;
2640   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2641     return NVPTXISD::Tex2DArrayS32Float;
2642   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2643     return NVPTXISD::Tex2DArrayS32FloatLevel;
2644   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2645     return NVPTXISD::Tex2DArrayS32FloatGrad;
2646   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2647     return NVPTXISD::Tex2DArrayU32S32;
2648   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2649     return NVPTXISD::Tex2DArrayU32Float;
2650   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2651     return NVPTXISD::Tex2DArrayU32FloatLevel;
2652   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2653     return NVPTXISD::Tex2DArrayU32FloatGrad;
2654 
2655   case Intrinsic::nvvm_tex_3d_v4f32_s32:
2656     return NVPTXISD::Tex3DFloatS32;
2657   case Intrinsic::nvvm_tex_3d_v4f32_f32:
2658     return NVPTXISD::Tex3DFloatFloat;
2659   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2660     return NVPTXISD::Tex3DFloatFloatLevel;
2661   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2662     return NVPTXISD::Tex3DFloatFloatGrad;
2663   case Intrinsic::nvvm_tex_3d_v4s32_s32:
2664     return NVPTXISD::Tex3DS32S32;
2665   case Intrinsic::nvvm_tex_3d_v4s32_f32:
2666     return NVPTXISD::Tex3DS32Float;
2667   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2668     return NVPTXISD::Tex3DS32FloatLevel;
2669   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2670     return NVPTXISD::Tex3DS32FloatGrad;
2671   case Intrinsic::nvvm_tex_3d_v4u32_s32:
2672     return NVPTXISD::Tex3DU32S32;
2673   case Intrinsic::nvvm_tex_3d_v4u32_f32:
2674     return NVPTXISD::Tex3DU32Float;
2675   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2676     return NVPTXISD::Tex3DU32FloatLevel;
2677   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2678     return NVPTXISD::Tex3DU32FloatGrad;
2679 
2680   case Intrinsic::nvvm_tex_cube_v4f32_f32:
2681     return NVPTXISD::TexCubeFloatFloat;
2682   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2683     return NVPTXISD::TexCubeFloatFloatLevel;
2684   case Intrinsic::nvvm_tex_cube_v4s32_f32:
2685     return NVPTXISD::TexCubeS32Float;
2686   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2687     return NVPTXISD::TexCubeS32FloatLevel;
2688   case Intrinsic::nvvm_tex_cube_v4u32_f32:
2689     return NVPTXISD::TexCubeU32Float;
2690   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2691     return NVPTXISD::TexCubeU32FloatLevel;
2692 
2693   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2694     return NVPTXISD::TexCubeArrayFloatFloat;
2695   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2696     return NVPTXISD::TexCubeArrayFloatFloatLevel;
2697   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2698     return NVPTXISD::TexCubeArrayS32Float;
2699   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2700     return NVPTXISD::TexCubeArrayS32FloatLevel;
2701   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2702     return NVPTXISD::TexCubeArrayU32Float;
2703   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2704     return NVPTXISD::TexCubeArrayU32FloatLevel;
2705 
2706   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2707     return NVPTXISD::Tld4R2DFloatFloat;
2708   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2709     return NVPTXISD::Tld4G2DFloatFloat;
2710   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2711     return NVPTXISD::Tld4B2DFloatFloat;
2712   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2713     return NVPTXISD::Tld4A2DFloatFloat;
2714   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2715     return NVPTXISD::Tld4R2DS64Float;
2716   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2717     return NVPTXISD::Tld4G2DS64Float;
2718   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2719     return NVPTXISD::Tld4B2DS64Float;
2720   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2721     return NVPTXISD::Tld4A2DS64Float;
2722   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2723     return NVPTXISD::Tld4R2DU64Float;
2724   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2725     return NVPTXISD::Tld4G2DU64Float;
2726   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2727     return NVPTXISD::Tld4B2DU64Float;
2728   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2729     return NVPTXISD::Tld4A2DU64Float;
2730 
2731   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2732     return NVPTXISD::TexUnified1DFloatS32;
2733   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2734     return NVPTXISD::TexUnified1DFloatFloat;
2735   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2736     return NVPTXISD::TexUnified1DFloatFloatLevel;
2737   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2738     return NVPTXISD::TexUnified1DFloatFloatGrad;
2739   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2740     return NVPTXISD::TexUnified1DS32S32;
2741   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2742     return NVPTXISD::TexUnified1DS32Float;
2743   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2744     return NVPTXISD::TexUnified1DS32FloatLevel;
2745   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2746     return NVPTXISD::TexUnified1DS32FloatGrad;
2747   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2748     return NVPTXISD::TexUnified1DU32S32;
2749   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2750     return NVPTXISD::TexUnified1DU32Float;
2751   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2752     return NVPTXISD::TexUnified1DU32FloatLevel;
2753   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2754     return NVPTXISD::TexUnified1DU32FloatGrad;
2755 
2756   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2757     return NVPTXISD::TexUnified1DArrayFloatS32;
2758   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2759     return NVPTXISD::TexUnified1DArrayFloatFloat;
2760   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2761     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
2762   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2763     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
2764   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2765     return NVPTXISD::TexUnified1DArrayS32S32;
2766   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2767     return NVPTXISD::TexUnified1DArrayS32Float;
2768   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2769     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
2770   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2771     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
2772   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2773     return NVPTXISD::TexUnified1DArrayU32S32;
2774   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2775     return NVPTXISD::TexUnified1DArrayU32Float;
2776   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2777     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
2778   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2779     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
2780 
2781   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2782     return NVPTXISD::TexUnified2DFloatS32;
2783   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2784     return NVPTXISD::TexUnified2DFloatFloat;
2785   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2786     return NVPTXISD::TexUnified2DFloatFloatLevel;
2787   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2788     return NVPTXISD::TexUnified2DFloatFloatGrad;
2789   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2790     return NVPTXISD::TexUnified2DS32S32;
2791   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2792     return NVPTXISD::TexUnified2DS32Float;
2793   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2794     return NVPTXISD::TexUnified2DS32FloatLevel;
2795   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2796     return NVPTXISD::TexUnified2DS32FloatGrad;
2797   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2798     return NVPTXISD::TexUnified2DU32S32;
2799   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2800     return NVPTXISD::TexUnified2DU32Float;
2801   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2802     return NVPTXISD::TexUnified2DU32FloatLevel;
2803   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2804     return NVPTXISD::TexUnified2DU32FloatGrad;
2805 
2806   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2807     return NVPTXISD::TexUnified2DArrayFloatS32;
2808   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2809     return NVPTXISD::TexUnified2DArrayFloatFloat;
2810   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2811     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
2812   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2813     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
2814   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
2815     return NVPTXISD::TexUnified2DArrayS32S32;
2816   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
2817     return NVPTXISD::TexUnified2DArrayS32Float;
2818   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
2819     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
2820   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
2821     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
2822   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
2823     return NVPTXISD::TexUnified2DArrayU32S32;
2824   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
2825     return NVPTXISD::TexUnified2DArrayU32Float;
2826   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
2827     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
2828   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
2829     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
2830 
2831   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
2832     return NVPTXISD::TexUnified3DFloatS32;
2833   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
2834     return NVPTXISD::TexUnified3DFloatFloat;
2835   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
2836     return NVPTXISD::TexUnified3DFloatFloatLevel;
2837   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
2838     return NVPTXISD::TexUnified3DFloatFloatGrad;
2839   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
2840     return NVPTXISD::TexUnified3DS32S32;
2841   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
2842     return NVPTXISD::TexUnified3DS32Float;
2843   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
2844     return NVPTXISD::TexUnified3DS32FloatLevel;
2845   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
2846     return NVPTXISD::TexUnified3DS32FloatGrad;
2847   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
2848     return NVPTXISD::TexUnified3DU32S32;
2849   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
2850     return NVPTXISD::TexUnified3DU32Float;
2851   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
2852     return NVPTXISD::TexUnified3DU32FloatLevel;
2853   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
2854     return NVPTXISD::TexUnified3DU32FloatGrad;
2855 
2856   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
2857     return NVPTXISD::TexUnifiedCubeFloatFloat;
2858   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
2859     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
2860   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
2861     return NVPTXISD::TexUnifiedCubeS32Float;
2862   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
2863     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
2864   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
2865     return NVPTXISD::TexUnifiedCubeU32Float;
2866   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
2867     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
2868 
2869   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
2870     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
2871   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
2872     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
2873   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
2874     return NVPTXISD::TexUnifiedCubeArrayS32Float;
2875   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
2876     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
2877   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
2878     return NVPTXISD::TexUnifiedCubeArrayU32Float;
2879   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
2880     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
2881 
2882   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
2883     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
2884   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
2885     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
2886   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
2887     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
2888   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
2889     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
2890   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
2891     return NVPTXISD::Tld4UnifiedR2DS64Float;
2892   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
2893     return NVPTXISD::Tld4UnifiedG2DS64Float;
2894   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
2895     return NVPTXISD::Tld4UnifiedB2DS64Float;
2896   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
2897     return NVPTXISD::Tld4UnifiedA2DS64Float;
2898   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
2899     return NVPTXISD::Tld4UnifiedR2DU64Float;
2900   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
2901     return NVPTXISD::Tld4UnifiedG2DU64Float;
2902   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
2903     return NVPTXISD::Tld4UnifiedB2DU64Float;
2904   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
2905     return NVPTXISD::Tld4UnifiedA2DU64Float;
2906   }
2907 }
2908 
getOpcForSurfaceInstr(unsigned Intrinsic)2909 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
2910   switch (Intrinsic) {
2911   default:
2912     return 0;
2913   case Intrinsic::nvvm_suld_1d_i8_clamp:
2914     return NVPTXISD::Suld1DI8Clamp;
2915   case Intrinsic::nvvm_suld_1d_i16_clamp:
2916     return NVPTXISD::Suld1DI16Clamp;
2917   case Intrinsic::nvvm_suld_1d_i32_clamp:
2918     return NVPTXISD::Suld1DI32Clamp;
2919   case Intrinsic::nvvm_suld_1d_i64_clamp:
2920     return NVPTXISD::Suld1DI64Clamp;
2921   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
2922     return NVPTXISD::Suld1DV2I8Clamp;
2923   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
2924     return NVPTXISD::Suld1DV2I16Clamp;
2925   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
2926     return NVPTXISD::Suld1DV2I32Clamp;
2927   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
2928     return NVPTXISD::Suld1DV2I64Clamp;
2929   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
2930     return NVPTXISD::Suld1DV4I8Clamp;
2931   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
2932     return NVPTXISD::Suld1DV4I16Clamp;
2933   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
2934     return NVPTXISD::Suld1DV4I32Clamp;
2935   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
2936     return NVPTXISD::Suld1DArrayI8Clamp;
2937   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
2938     return NVPTXISD::Suld1DArrayI16Clamp;
2939   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
2940     return NVPTXISD::Suld1DArrayI32Clamp;
2941   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
2942     return NVPTXISD::Suld1DArrayI64Clamp;
2943   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
2944     return NVPTXISD::Suld1DArrayV2I8Clamp;
2945   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
2946     return NVPTXISD::Suld1DArrayV2I16Clamp;
2947   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
2948     return NVPTXISD::Suld1DArrayV2I32Clamp;
2949   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
2950     return NVPTXISD::Suld1DArrayV2I64Clamp;
2951   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
2952     return NVPTXISD::Suld1DArrayV4I8Clamp;
2953   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
2954     return NVPTXISD::Suld1DArrayV4I16Clamp;
2955   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
2956     return NVPTXISD::Suld1DArrayV4I32Clamp;
2957   case Intrinsic::nvvm_suld_2d_i8_clamp:
2958     return NVPTXISD::Suld2DI8Clamp;
2959   case Intrinsic::nvvm_suld_2d_i16_clamp:
2960     return NVPTXISD::Suld2DI16Clamp;
2961   case Intrinsic::nvvm_suld_2d_i32_clamp:
2962     return NVPTXISD::Suld2DI32Clamp;
2963   case Intrinsic::nvvm_suld_2d_i64_clamp:
2964     return NVPTXISD::Suld2DI64Clamp;
2965   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
2966     return NVPTXISD::Suld2DV2I8Clamp;
2967   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
2968     return NVPTXISD::Suld2DV2I16Clamp;
2969   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
2970     return NVPTXISD::Suld2DV2I32Clamp;
2971   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
2972     return NVPTXISD::Suld2DV2I64Clamp;
2973   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
2974     return NVPTXISD::Suld2DV4I8Clamp;
2975   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
2976     return NVPTXISD::Suld2DV4I16Clamp;
2977   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
2978     return NVPTXISD::Suld2DV4I32Clamp;
2979   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
2980     return NVPTXISD::Suld2DArrayI8Clamp;
2981   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
2982     return NVPTXISD::Suld2DArrayI16Clamp;
2983   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
2984     return NVPTXISD::Suld2DArrayI32Clamp;
2985   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
2986     return NVPTXISD::Suld2DArrayI64Clamp;
2987   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
2988     return NVPTXISD::Suld2DArrayV2I8Clamp;
2989   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
2990     return NVPTXISD::Suld2DArrayV2I16Clamp;
2991   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
2992     return NVPTXISD::Suld2DArrayV2I32Clamp;
2993   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
2994     return NVPTXISD::Suld2DArrayV2I64Clamp;
2995   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
2996     return NVPTXISD::Suld2DArrayV4I8Clamp;
2997   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
2998     return NVPTXISD::Suld2DArrayV4I16Clamp;
2999   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3000     return NVPTXISD::Suld2DArrayV4I32Clamp;
3001   case Intrinsic::nvvm_suld_3d_i8_clamp:
3002     return NVPTXISD::Suld3DI8Clamp;
3003   case Intrinsic::nvvm_suld_3d_i16_clamp:
3004     return NVPTXISD::Suld3DI16Clamp;
3005   case Intrinsic::nvvm_suld_3d_i32_clamp:
3006     return NVPTXISD::Suld3DI32Clamp;
3007   case Intrinsic::nvvm_suld_3d_i64_clamp:
3008     return NVPTXISD::Suld3DI64Clamp;
3009   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3010     return NVPTXISD::Suld3DV2I8Clamp;
3011   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3012     return NVPTXISD::Suld3DV2I16Clamp;
3013   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3014     return NVPTXISD::Suld3DV2I32Clamp;
3015   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3016     return NVPTXISD::Suld3DV2I64Clamp;
3017   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3018     return NVPTXISD::Suld3DV4I8Clamp;
3019   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3020     return NVPTXISD::Suld3DV4I16Clamp;
3021   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3022     return NVPTXISD::Suld3DV4I32Clamp;
3023   case Intrinsic::nvvm_suld_1d_i8_trap:
3024     return NVPTXISD::Suld1DI8Trap;
3025   case Intrinsic::nvvm_suld_1d_i16_trap:
3026     return NVPTXISD::Suld1DI16Trap;
3027   case Intrinsic::nvvm_suld_1d_i32_trap:
3028     return NVPTXISD::Suld1DI32Trap;
3029   case Intrinsic::nvvm_suld_1d_i64_trap:
3030     return NVPTXISD::Suld1DI64Trap;
3031   case Intrinsic::nvvm_suld_1d_v2i8_trap:
3032     return NVPTXISD::Suld1DV2I8Trap;
3033   case Intrinsic::nvvm_suld_1d_v2i16_trap:
3034     return NVPTXISD::Suld1DV2I16Trap;
3035   case Intrinsic::nvvm_suld_1d_v2i32_trap:
3036     return NVPTXISD::Suld1DV2I32Trap;
3037   case Intrinsic::nvvm_suld_1d_v2i64_trap:
3038     return NVPTXISD::Suld1DV2I64Trap;
3039   case Intrinsic::nvvm_suld_1d_v4i8_trap:
3040     return NVPTXISD::Suld1DV4I8Trap;
3041   case Intrinsic::nvvm_suld_1d_v4i16_trap:
3042     return NVPTXISD::Suld1DV4I16Trap;
3043   case Intrinsic::nvvm_suld_1d_v4i32_trap:
3044     return NVPTXISD::Suld1DV4I32Trap;
3045   case Intrinsic::nvvm_suld_1d_array_i8_trap:
3046     return NVPTXISD::Suld1DArrayI8Trap;
3047   case Intrinsic::nvvm_suld_1d_array_i16_trap:
3048     return NVPTXISD::Suld1DArrayI16Trap;
3049   case Intrinsic::nvvm_suld_1d_array_i32_trap:
3050     return NVPTXISD::Suld1DArrayI32Trap;
3051   case Intrinsic::nvvm_suld_1d_array_i64_trap:
3052     return NVPTXISD::Suld1DArrayI64Trap;
3053   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3054     return NVPTXISD::Suld1DArrayV2I8Trap;
3055   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3056     return NVPTXISD::Suld1DArrayV2I16Trap;
3057   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3058     return NVPTXISD::Suld1DArrayV2I32Trap;
3059   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3060     return NVPTXISD::Suld1DArrayV2I64Trap;
3061   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3062     return NVPTXISD::Suld1DArrayV4I8Trap;
3063   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3064     return NVPTXISD::Suld1DArrayV4I16Trap;
3065   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3066     return NVPTXISD::Suld1DArrayV4I32Trap;
3067   case Intrinsic::nvvm_suld_2d_i8_trap:
3068     return NVPTXISD::Suld2DI8Trap;
3069   case Intrinsic::nvvm_suld_2d_i16_trap:
3070     return NVPTXISD::Suld2DI16Trap;
3071   case Intrinsic::nvvm_suld_2d_i32_trap:
3072     return NVPTXISD::Suld2DI32Trap;
3073   case Intrinsic::nvvm_suld_2d_i64_trap:
3074     return NVPTXISD::Suld2DI64Trap;
3075   case Intrinsic::nvvm_suld_2d_v2i8_trap:
3076     return NVPTXISD::Suld2DV2I8Trap;
3077   case Intrinsic::nvvm_suld_2d_v2i16_trap:
3078     return NVPTXISD::Suld2DV2I16Trap;
3079   case Intrinsic::nvvm_suld_2d_v2i32_trap:
3080     return NVPTXISD::Suld2DV2I32Trap;
3081   case Intrinsic::nvvm_suld_2d_v2i64_trap:
3082     return NVPTXISD::Suld2DV2I64Trap;
3083   case Intrinsic::nvvm_suld_2d_v4i8_trap:
3084     return NVPTXISD::Suld2DV4I8Trap;
3085   case Intrinsic::nvvm_suld_2d_v4i16_trap:
3086     return NVPTXISD::Suld2DV4I16Trap;
3087   case Intrinsic::nvvm_suld_2d_v4i32_trap:
3088     return NVPTXISD::Suld2DV4I32Trap;
3089   case Intrinsic::nvvm_suld_2d_array_i8_trap:
3090     return NVPTXISD::Suld2DArrayI8Trap;
3091   case Intrinsic::nvvm_suld_2d_array_i16_trap:
3092     return NVPTXISD::Suld2DArrayI16Trap;
3093   case Intrinsic::nvvm_suld_2d_array_i32_trap:
3094     return NVPTXISD::Suld2DArrayI32Trap;
3095   case Intrinsic::nvvm_suld_2d_array_i64_trap:
3096     return NVPTXISD::Suld2DArrayI64Trap;
3097   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3098     return NVPTXISD::Suld2DArrayV2I8Trap;
3099   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3100     return NVPTXISD::Suld2DArrayV2I16Trap;
3101   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3102     return NVPTXISD::Suld2DArrayV2I32Trap;
3103   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3104     return NVPTXISD::Suld2DArrayV2I64Trap;
3105   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3106     return NVPTXISD::Suld2DArrayV4I8Trap;
3107   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3108     return NVPTXISD::Suld2DArrayV4I16Trap;
3109   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3110     return NVPTXISD::Suld2DArrayV4I32Trap;
3111   case Intrinsic::nvvm_suld_3d_i8_trap:
3112     return NVPTXISD::Suld3DI8Trap;
3113   case Intrinsic::nvvm_suld_3d_i16_trap:
3114     return NVPTXISD::Suld3DI16Trap;
3115   case Intrinsic::nvvm_suld_3d_i32_trap:
3116     return NVPTXISD::Suld3DI32Trap;
3117   case Intrinsic::nvvm_suld_3d_i64_trap:
3118     return NVPTXISD::Suld3DI64Trap;
3119   case Intrinsic::nvvm_suld_3d_v2i8_trap:
3120     return NVPTXISD::Suld3DV2I8Trap;
3121   case Intrinsic::nvvm_suld_3d_v2i16_trap:
3122     return NVPTXISD::Suld3DV2I16Trap;
3123   case Intrinsic::nvvm_suld_3d_v2i32_trap:
3124     return NVPTXISD::Suld3DV2I32Trap;
3125   case Intrinsic::nvvm_suld_3d_v2i64_trap:
3126     return NVPTXISD::Suld3DV2I64Trap;
3127   case Intrinsic::nvvm_suld_3d_v4i8_trap:
3128     return NVPTXISD::Suld3DV4I8Trap;
3129   case Intrinsic::nvvm_suld_3d_v4i16_trap:
3130     return NVPTXISD::Suld3DV4I16Trap;
3131   case Intrinsic::nvvm_suld_3d_v4i32_trap:
3132     return NVPTXISD::Suld3DV4I32Trap;
3133   case Intrinsic::nvvm_suld_1d_i8_zero:
3134     return NVPTXISD::Suld1DI8Zero;
3135   case Intrinsic::nvvm_suld_1d_i16_zero:
3136     return NVPTXISD::Suld1DI16Zero;
3137   case Intrinsic::nvvm_suld_1d_i32_zero:
3138     return NVPTXISD::Suld1DI32Zero;
3139   case Intrinsic::nvvm_suld_1d_i64_zero:
3140     return NVPTXISD::Suld1DI64Zero;
3141   case Intrinsic::nvvm_suld_1d_v2i8_zero:
3142     return NVPTXISD::Suld1DV2I8Zero;
3143   case Intrinsic::nvvm_suld_1d_v2i16_zero:
3144     return NVPTXISD::Suld1DV2I16Zero;
3145   case Intrinsic::nvvm_suld_1d_v2i32_zero:
3146     return NVPTXISD::Suld1DV2I32Zero;
3147   case Intrinsic::nvvm_suld_1d_v2i64_zero:
3148     return NVPTXISD::Suld1DV2I64Zero;
3149   case Intrinsic::nvvm_suld_1d_v4i8_zero:
3150     return NVPTXISD::Suld1DV4I8Zero;
3151   case Intrinsic::nvvm_suld_1d_v4i16_zero:
3152     return NVPTXISD::Suld1DV4I16Zero;
3153   case Intrinsic::nvvm_suld_1d_v4i32_zero:
3154     return NVPTXISD::Suld1DV4I32Zero;
3155   case Intrinsic::nvvm_suld_1d_array_i8_zero:
3156     return NVPTXISD::Suld1DArrayI8Zero;
3157   case Intrinsic::nvvm_suld_1d_array_i16_zero:
3158     return NVPTXISD::Suld1DArrayI16Zero;
3159   case Intrinsic::nvvm_suld_1d_array_i32_zero:
3160     return NVPTXISD::Suld1DArrayI32Zero;
3161   case Intrinsic::nvvm_suld_1d_array_i64_zero:
3162     return NVPTXISD::Suld1DArrayI64Zero;
3163   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3164     return NVPTXISD::Suld1DArrayV2I8Zero;
3165   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3166     return NVPTXISD::Suld1DArrayV2I16Zero;
3167   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3168     return NVPTXISD::Suld1DArrayV2I32Zero;
3169   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3170     return NVPTXISD::Suld1DArrayV2I64Zero;
3171   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3172     return NVPTXISD::Suld1DArrayV4I8Zero;
3173   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3174     return NVPTXISD::Suld1DArrayV4I16Zero;
3175   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3176     return NVPTXISD::Suld1DArrayV4I32Zero;
3177   case Intrinsic::nvvm_suld_2d_i8_zero:
3178     return NVPTXISD::Suld2DI8Zero;
3179   case Intrinsic::nvvm_suld_2d_i16_zero:
3180     return NVPTXISD::Suld2DI16Zero;
3181   case Intrinsic::nvvm_suld_2d_i32_zero:
3182     return NVPTXISD::Suld2DI32Zero;
3183   case Intrinsic::nvvm_suld_2d_i64_zero:
3184     return NVPTXISD::Suld2DI64Zero;
3185   case Intrinsic::nvvm_suld_2d_v2i8_zero:
3186     return NVPTXISD::Suld2DV2I8Zero;
3187   case Intrinsic::nvvm_suld_2d_v2i16_zero:
3188     return NVPTXISD::Suld2DV2I16Zero;
3189   case Intrinsic::nvvm_suld_2d_v2i32_zero:
3190     return NVPTXISD::Suld2DV2I32Zero;
3191   case Intrinsic::nvvm_suld_2d_v2i64_zero:
3192     return NVPTXISD::Suld2DV2I64Zero;
3193   case Intrinsic::nvvm_suld_2d_v4i8_zero:
3194     return NVPTXISD::Suld2DV4I8Zero;
3195   case Intrinsic::nvvm_suld_2d_v4i16_zero:
3196     return NVPTXISD::Suld2DV4I16Zero;
3197   case Intrinsic::nvvm_suld_2d_v4i32_zero:
3198     return NVPTXISD::Suld2DV4I32Zero;
3199   case Intrinsic::nvvm_suld_2d_array_i8_zero:
3200     return NVPTXISD::Suld2DArrayI8Zero;
3201   case Intrinsic::nvvm_suld_2d_array_i16_zero:
3202     return NVPTXISD::Suld2DArrayI16Zero;
3203   case Intrinsic::nvvm_suld_2d_array_i32_zero:
3204     return NVPTXISD::Suld2DArrayI32Zero;
3205   case Intrinsic::nvvm_suld_2d_array_i64_zero:
3206     return NVPTXISD::Suld2DArrayI64Zero;
3207   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3208     return NVPTXISD::Suld2DArrayV2I8Zero;
3209   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3210     return NVPTXISD::Suld2DArrayV2I16Zero;
3211   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3212     return NVPTXISD::Suld2DArrayV2I32Zero;
3213   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3214     return NVPTXISD::Suld2DArrayV2I64Zero;
3215   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3216     return NVPTXISD::Suld2DArrayV4I8Zero;
3217   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3218     return NVPTXISD::Suld2DArrayV4I16Zero;
3219   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3220     return NVPTXISD::Suld2DArrayV4I32Zero;
3221   case Intrinsic::nvvm_suld_3d_i8_zero:
3222     return NVPTXISD::Suld3DI8Zero;
3223   case Intrinsic::nvvm_suld_3d_i16_zero:
3224     return NVPTXISD::Suld3DI16Zero;
3225   case Intrinsic::nvvm_suld_3d_i32_zero:
3226     return NVPTXISD::Suld3DI32Zero;
3227   case Intrinsic::nvvm_suld_3d_i64_zero:
3228     return NVPTXISD::Suld3DI64Zero;
3229   case Intrinsic::nvvm_suld_3d_v2i8_zero:
3230     return NVPTXISD::Suld3DV2I8Zero;
3231   case Intrinsic::nvvm_suld_3d_v2i16_zero:
3232     return NVPTXISD::Suld3DV2I16Zero;
3233   case Intrinsic::nvvm_suld_3d_v2i32_zero:
3234     return NVPTXISD::Suld3DV2I32Zero;
3235   case Intrinsic::nvvm_suld_3d_v2i64_zero:
3236     return NVPTXISD::Suld3DV2I64Zero;
3237   case Intrinsic::nvvm_suld_3d_v4i8_zero:
3238     return NVPTXISD::Suld3DV4I8Zero;
3239   case Intrinsic::nvvm_suld_3d_v4i16_zero:
3240     return NVPTXISD::Suld3DV4I16Zero;
3241   case Intrinsic::nvvm_suld_3d_v4i32_zero:
3242     return NVPTXISD::Suld3DV4I32Zero;
3243   }
3244 }
3245 
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need information that is only available in the
// "Value" type of the destination pointer -- in particular, the address space
// information.
getTgtMemIntrinsic(IntrinsicInfo & Info,const CallInst & I,unsigned Intrinsic) const3251 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3252     IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
3253   switch (Intrinsic) {
3254   default:
3255     return false;
3256 
3257   case Intrinsic::nvvm_atomic_load_add_f32:
3258     Info.opc = ISD::INTRINSIC_W_CHAIN;
3259     Info.memVT = MVT::f32;
3260     Info.ptrVal = I.getArgOperand(0);
3261     Info.offset = 0;
3262     Info.vol = 0;
3263     Info.readMem = true;
3264     Info.writeMem = true;
3265     Info.align = 0;
3266     return true;
3267 
3268   case Intrinsic::nvvm_atomic_load_inc_32:
3269   case Intrinsic::nvvm_atomic_load_dec_32:
3270     Info.opc = ISD::INTRINSIC_W_CHAIN;
3271     Info.memVT = MVT::i32;
3272     Info.ptrVal = I.getArgOperand(0);
3273     Info.offset = 0;
3274     Info.vol = 0;
3275     Info.readMem = true;
3276     Info.writeMem = true;
3277     Info.align = 0;
3278     return true;
3279 
3280   case Intrinsic::nvvm_ldu_global_i:
3281   case Intrinsic::nvvm_ldu_global_f:
3282   case Intrinsic::nvvm_ldu_global_p: {
3283     auto &DL = I.getModule()->getDataLayout();
3284     Info.opc = ISD::INTRINSIC_W_CHAIN;
3285     if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3286       Info.memVT = getValueType(DL, I.getType());
3287     else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3288       Info.memVT = getPointerTy(DL);
3289     else
3290       Info.memVT = getValueType(DL, I.getType());
3291     Info.ptrVal = I.getArgOperand(0);
3292     Info.offset = 0;
3293     Info.vol = 0;
3294     Info.readMem = true;
3295     Info.writeMem = false;
3296     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3297 
3298     return true;
3299   }
3300   case Intrinsic::nvvm_ldg_global_i:
3301   case Intrinsic::nvvm_ldg_global_f:
3302   case Intrinsic::nvvm_ldg_global_p: {
3303     auto &DL = I.getModule()->getDataLayout();
3304 
3305     Info.opc = ISD::INTRINSIC_W_CHAIN;
3306     if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3307       Info.memVT = getValueType(DL, I.getType());
3308     else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3309       Info.memVT = getPointerTy(DL);
3310     else
3311       Info.memVT = getValueType(DL, I.getType());
3312     Info.ptrVal = I.getArgOperand(0);
3313     Info.offset = 0;
3314     Info.vol = 0;
3315     Info.readMem = true;
3316     Info.writeMem = false;
3317     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3318 
3319     return true;
3320   }
3321 
3322   case Intrinsic::nvvm_tex_1d_v4f32_s32:
3323   case Intrinsic::nvvm_tex_1d_v4f32_f32:
3324   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3325   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3326   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3327   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3328   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3329   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3330   case Intrinsic::nvvm_tex_2d_v4f32_s32:
3331   case Intrinsic::nvvm_tex_2d_v4f32_f32:
3332   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3333   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3334   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3335   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3336   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3337   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3338   case Intrinsic::nvvm_tex_3d_v4f32_s32:
3339   case Intrinsic::nvvm_tex_3d_v4f32_f32:
3340   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3341   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3342   case Intrinsic::nvvm_tex_cube_v4f32_f32:
3343   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3344   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3345   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3346   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3347   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3348   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3349   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3350   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3351   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3352   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3353   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3354   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3355   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3356   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3357   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3358   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3359   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3360   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3361   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3362   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3363   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3364   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3365   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3366   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3367   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3368   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3369   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3370   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3371   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3372   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3373   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3374   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3375   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3376   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3377   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
3378     Info.opc = getOpcForTextureInstr(Intrinsic);
3379     Info.memVT = MVT::v4f32;
3380     Info.ptrVal = nullptr;
3381     Info.offset = 0;
3382     Info.vol = 0;
3383     Info.readMem = true;
3384     Info.writeMem = false;
3385     Info.align = 16;
3386     return true;
3387   }
3388   case Intrinsic::nvvm_tex_1d_v4s32_s32:
3389   case Intrinsic::nvvm_tex_1d_v4s32_f32:
3390   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3391   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3392   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3393   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3394   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3395   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3396   case Intrinsic::nvvm_tex_2d_v4s32_s32:
3397   case Intrinsic::nvvm_tex_2d_v4s32_f32:
3398   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3399   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3400   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3401   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3402   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3403   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3404   case Intrinsic::nvvm_tex_3d_v4s32_s32:
3405   case Intrinsic::nvvm_tex_3d_v4s32_f32:
3406   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3407   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3408   case Intrinsic::nvvm_tex_cube_v4s32_f32:
3409   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3410   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3411   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3412   case Intrinsic::nvvm_tex_cube_v4u32_f32:
3413   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3414   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3415   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3416   case Intrinsic::nvvm_tex_1d_v4u32_s32:
3417   case Intrinsic::nvvm_tex_1d_v4u32_f32:
3418   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3419   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3420   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3421   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3422   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3423   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3424   case Intrinsic::nvvm_tex_2d_v4u32_s32:
3425   case Intrinsic::nvvm_tex_2d_v4u32_f32:
3426   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3427   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3428   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3429   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3430   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3431   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3432   case Intrinsic::nvvm_tex_3d_v4u32_s32:
3433   case Intrinsic::nvvm_tex_3d_v4u32_f32:
3434   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3435   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3436   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3437   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3438   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3439   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3440   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3441   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3442   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3443   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3444   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3445   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3446   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3447   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3448   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3449   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3450   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3451   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3452   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3453   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3454   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3455   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3456   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3457   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3458   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3459   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3460   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3461   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3462   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3463   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3464   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3465   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3466   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3467   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3468   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3469   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3470   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3471   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3472   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3473   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3474   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3475   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3476   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3477   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3478   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3479   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3480   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3481   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3482   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3483   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3484   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3485   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3486   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3487   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3488   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3489   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3490   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3491   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3492   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3493   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3494   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3495   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3496   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3497   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3498   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3499   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
3500     Info.opc = getOpcForTextureInstr(Intrinsic);
3501     Info.memVT = MVT::v4i32;
3502     Info.ptrVal = nullptr;
3503     Info.offset = 0;
3504     Info.vol = 0;
3505     Info.readMem = true;
3506     Info.writeMem = false;
3507     Info.align = 16;
3508     return true;
3509   }
3510   case Intrinsic::nvvm_suld_1d_i8_clamp:
3511   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3512   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3513   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3514   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3515   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3516   case Intrinsic::nvvm_suld_2d_i8_clamp:
3517   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3518   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3519   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3520   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3521   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3522   case Intrinsic::nvvm_suld_3d_i8_clamp:
3523   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3524   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3525   case Intrinsic::nvvm_suld_1d_i8_trap:
3526   case Intrinsic::nvvm_suld_1d_v2i8_trap:
3527   case Intrinsic::nvvm_suld_1d_v4i8_trap:
3528   case Intrinsic::nvvm_suld_1d_array_i8_trap:
3529   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3530   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3531   case Intrinsic::nvvm_suld_2d_i8_trap:
3532   case Intrinsic::nvvm_suld_2d_v2i8_trap:
3533   case Intrinsic::nvvm_suld_2d_v4i8_trap:
3534   case Intrinsic::nvvm_suld_2d_array_i8_trap:
3535   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3536   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3537   case Intrinsic::nvvm_suld_3d_i8_trap:
3538   case Intrinsic::nvvm_suld_3d_v2i8_trap:
3539   case Intrinsic::nvvm_suld_3d_v4i8_trap:
3540   case Intrinsic::nvvm_suld_1d_i8_zero:
3541   case Intrinsic::nvvm_suld_1d_v2i8_zero:
3542   case Intrinsic::nvvm_suld_1d_v4i8_zero:
3543   case Intrinsic::nvvm_suld_1d_array_i8_zero:
3544   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3545   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3546   case Intrinsic::nvvm_suld_2d_i8_zero:
3547   case Intrinsic::nvvm_suld_2d_v2i8_zero:
3548   case Intrinsic::nvvm_suld_2d_v4i8_zero:
3549   case Intrinsic::nvvm_suld_2d_array_i8_zero:
3550   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3551   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3552   case Intrinsic::nvvm_suld_3d_i8_zero:
3553   case Intrinsic::nvvm_suld_3d_v2i8_zero:
3554   case Intrinsic::nvvm_suld_3d_v4i8_zero: {
3555     Info.opc = getOpcForSurfaceInstr(Intrinsic);
3556     Info.memVT = MVT::i8;
3557     Info.ptrVal = nullptr;
3558     Info.offset = 0;
3559     Info.vol = 0;
3560     Info.readMem = true;
3561     Info.writeMem = false;
3562     Info.align = 16;
3563     return true;
3564   }
3565   case Intrinsic::nvvm_suld_1d_i16_clamp:
3566   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3567   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3568   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3569   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3570   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3571   case Intrinsic::nvvm_suld_2d_i16_clamp:
3572   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3573   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3574   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3575   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3576   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3577   case Intrinsic::nvvm_suld_3d_i16_clamp:
3578   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3579   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3580   case Intrinsic::nvvm_suld_1d_i16_trap:
3581   case Intrinsic::nvvm_suld_1d_v2i16_trap:
3582   case Intrinsic::nvvm_suld_1d_v4i16_trap:
3583   case Intrinsic::nvvm_suld_1d_array_i16_trap:
3584   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3585   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3586   case Intrinsic::nvvm_suld_2d_i16_trap:
3587   case Intrinsic::nvvm_suld_2d_v2i16_trap:
3588   case Intrinsic::nvvm_suld_2d_v4i16_trap:
3589   case Intrinsic::nvvm_suld_2d_array_i16_trap:
3590   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3591   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3592   case Intrinsic::nvvm_suld_3d_i16_trap:
3593   case Intrinsic::nvvm_suld_3d_v2i16_trap:
3594   case Intrinsic::nvvm_suld_3d_v4i16_trap:
3595   case Intrinsic::nvvm_suld_1d_i16_zero:
3596   case Intrinsic::nvvm_suld_1d_v2i16_zero:
3597   case Intrinsic::nvvm_suld_1d_v4i16_zero:
3598   case Intrinsic::nvvm_suld_1d_array_i16_zero:
3599   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3600   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3601   case Intrinsic::nvvm_suld_2d_i16_zero:
3602   case Intrinsic::nvvm_suld_2d_v2i16_zero:
3603   case Intrinsic::nvvm_suld_2d_v4i16_zero:
3604   case Intrinsic::nvvm_suld_2d_array_i16_zero:
3605   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3606   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3607   case Intrinsic::nvvm_suld_3d_i16_zero:
3608   case Intrinsic::nvvm_suld_3d_v2i16_zero:
3609   case Intrinsic::nvvm_suld_3d_v4i16_zero: {
3610     Info.opc = getOpcForSurfaceInstr(Intrinsic);
3611     Info.memVT = MVT::i16;
3612     Info.ptrVal = nullptr;
3613     Info.offset = 0;
3614     Info.vol = 0;
3615     Info.readMem = true;
3616     Info.writeMem = false;
3617     Info.align = 16;
3618     return true;
3619   }
3620   case Intrinsic::nvvm_suld_1d_i32_clamp:
3621   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3622   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3623   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3624   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3625   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3626   case Intrinsic::nvvm_suld_2d_i32_clamp:
3627   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3628   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3629   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3630   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3631   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3632   case Intrinsic::nvvm_suld_3d_i32_clamp:
3633   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3634   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3635   case Intrinsic::nvvm_suld_1d_i32_trap:
3636   case Intrinsic::nvvm_suld_1d_v2i32_trap:
3637   case Intrinsic::nvvm_suld_1d_v4i32_trap:
3638   case Intrinsic::nvvm_suld_1d_array_i32_trap:
3639   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3640   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3641   case Intrinsic::nvvm_suld_2d_i32_trap:
3642   case Intrinsic::nvvm_suld_2d_v2i32_trap:
3643   case Intrinsic::nvvm_suld_2d_v4i32_trap:
3644   case Intrinsic::nvvm_suld_2d_array_i32_trap:
3645   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3646   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3647   case Intrinsic::nvvm_suld_3d_i32_trap:
3648   case Intrinsic::nvvm_suld_3d_v2i32_trap:
3649   case Intrinsic::nvvm_suld_3d_v4i32_trap:
3650   case Intrinsic::nvvm_suld_1d_i32_zero:
3651   case Intrinsic::nvvm_suld_1d_v2i32_zero:
3652   case Intrinsic::nvvm_suld_1d_v4i32_zero:
3653   case Intrinsic::nvvm_suld_1d_array_i32_zero:
3654   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3655   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3656   case Intrinsic::nvvm_suld_2d_i32_zero:
3657   case Intrinsic::nvvm_suld_2d_v2i32_zero:
3658   case Intrinsic::nvvm_suld_2d_v4i32_zero:
3659   case Intrinsic::nvvm_suld_2d_array_i32_zero:
3660   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3661   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3662   case Intrinsic::nvvm_suld_3d_i32_zero:
3663   case Intrinsic::nvvm_suld_3d_v2i32_zero:
3664   case Intrinsic::nvvm_suld_3d_v4i32_zero: {
3665     Info.opc = getOpcForSurfaceInstr(Intrinsic);
3666     Info.memVT = MVT::i32;
3667     Info.ptrVal = nullptr;
3668     Info.offset = 0;
3669     Info.vol = 0;
3670     Info.readMem = true;
3671     Info.writeMem = false;
3672     Info.align = 16;
3673     return true;
3674   }
3675   case Intrinsic::nvvm_suld_1d_i64_clamp:
3676   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3677   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3678   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3679   case Intrinsic::nvvm_suld_2d_i64_clamp:
3680   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3681   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3682   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3683   case Intrinsic::nvvm_suld_3d_i64_clamp:
3684   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3685   case Intrinsic::nvvm_suld_1d_i64_trap:
3686   case Intrinsic::nvvm_suld_1d_v2i64_trap:
3687   case Intrinsic::nvvm_suld_1d_array_i64_trap:
3688   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3689   case Intrinsic::nvvm_suld_2d_i64_trap:
3690   case Intrinsic::nvvm_suld_2d_v2i64_trap:
3691   case Intrinsic::nvvm_suld_2d_array_i64_trap:
3692   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3693   case Intrinsic::nvvm_suld_3d_i64_trap:
3694   case Intrinsic::nvvm_suld_3d_v2i64_trap:
3695   case Intrinsic::nvvm_suld_1d_i64_zero:
3696   case Intrinsic::nvvm_suld_1d_v2i64_zero:
3697   case Intrinsic::nvvm_suld_1d_array_i64_zero:
3698   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3699   case Intrinsic::nvvm_suld_2d_i64_zero:
3700   case Intrinsic::nvvm_suld_2d_v2i64_zero:
3701   case Intrinsic::nvvm_suld_2d_array_i64_zero:
3702   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3703   case Intrinsic::nvvm_suld_3d_i64_zero:
3704   case Intrinsic::nvvm_suld_3d_v2i64_zero: {
3705     Info.opc = getOpcForSurfaceInstr(Intrinsic);
3706     Info.memVT = MVT::i64;
3707     Info.ptrVal = nullptr;
3708     Info.offset = 0;
3709     Info.vol = 0;
3710     Info.readMem = true;
3711     Info.writeMem = false;
3712     Info.align = 16;
3713     return true;
3714   }
3715   }
3716   return false;
3717 }
3718 
3719 /// isLegalAddressingMode - Return true if the addressing mode represented
3720 /// by AM is legal for this target, for a load/store of the specified type.
3721 /// Used to guide target specific optimizations, like loop strength reduction
3722 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
3723 /// (CodeGenPrepare.cpp)
isLegalAddressingMode(const DataLayout & DL,const AddrMode & AM,Type * Ty,unsigned AS) const3724 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
3725                                                 const AddrMode &AM, Type *Ty,
3726                                                 unsigned AS) const {
3727 
3728   // AddrMode - This represents an addressing mode of:
3729   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
3730   //
3731   // The legal address modes are
3732   // - [avar]
3733   // - [areg]
3734   // - [areg+immoff]
3735   // - [immAddr]
3736 
3737   if (AM.BaseGV) {
3738     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
3739   }
3740 
3741   switch (AM.Scale) {
3742   case 0: // "r", "r+i" or "i" is allowed
3743     break;
3744   case 1:
3745     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
3746       return false;
3747     // Otherwise we have r+i.
3748     break;
3749   default:
3750     // No scale > 1 is allowed
3751     return false;
3752   }
3753   return true;
3754 }
3755 
3756 //===----------------------------------------------------------------------===//
3757 //                         NVPTX Inline Assembly Support
3758 //===----------------------------------------------------------------------===//
3759 
3760 /// getConstraintType - Given a constraint letter, return the type of
3761 /// constraint it is for this target.
3762 NVPTXTargetLowering::ConstraintType
getConstraintType(StringRef Constraint) const3763 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
3764   if (Constraint.size() == 1) {
3765     switch (Constraint[0]) {
3766     default:
3767       break;
3768     case 'b':
3769     case 'r':
3770     case 'h':
3771     case 'c':
3772     case 'l':
3773     case 'f':
3774     case 'd':
3775     case '0':
3776     case 'N':
3777       return C_RegisterClass;
3778     }
3779   }
3780   return TargetLowering::getConstraintType(Constraint);
3781 }
3782 
3783 std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo * TRI,StringRef Constraint,MVT VT) const3784 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
3785                                                   StringRef Constraint,
3786                                                   MVT VT) const {
3787   if (Constraint.size() == 1) {
3788     switch (Constraint[0]) {
3789     case 'b':
3790       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
3791     case 'c':
3792       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3793     case 'h':
3794       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3795     case 'r':
3796       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
3797     case 'l':
3798     case 'N':
3799       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
3800     case 'f':
3801       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
3802     case 'd':
3803       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
3804     }
3805   }
3806   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3807 }
3808 
3809 //===----------------------------------------------------------------------===//
3810 //                         NVPTX DAG Combining
3811 //===----------------------------------------------------------------------===//
3812 
allowFMA(MachineFunction & MF,CodeGenOpt::Level OptLevel) const3813 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
3814                                    CodeGenOpt::Level OptLevel) const {
3815   const Function *F = MF.getFunction();
3816   const TargetOptions &TO = MF.getTarget().Options;
3817 
3818   // Always honor command-line argument
3819   if (FMAContractLevelOpt.getNumOccurrences() > 0) {
3820     return FMAContractLevelOpt > 0;
3821   } else if (OptLevel == 0) {
3822     // Do not contract if we're not optimizing the code
3823     return false;
3824   } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
3825     // Honor TargetOptions flags that explicitly say fusion is okay
3826     return true;
3827   } else if (F->hasFnAttribute("unsafe-fp-math")) {
3828     // Check for unsafe-fp-math=true coming from Clang
3829     Attribute Attr = F->getFnAttribute("unsafe-fp-math");
3830     StringRef Val = Attr.getValueAsString();
3831     if (Val == "true")
3832       return true;
3833   }
3834 
3835   // We did not have a clear indication that fusion is allowed, so assume not
3836   return false;
3837 }
3838 
3839 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
3840 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
3841 /// called with the default operands, and if that fails, with commuted
3842 /// operands.
PerformADDCombineWithOperands(SDNode * N,SDValue N0,SDValue N1,TargetLowering::DAGCombinerInfo & DCI,const NVPTXSubtarget & Subtarget,CodeGenOpt::Level OptLevel)3843 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
3844                                            TargetLowering::DAGCombinerInfo &DCI,
3845                                              const NVPTXSubtarget &Subtarget,
3846                                              CodeGenOpt::Level OptLevel) {
3847   SelectionDAG  &DAG = DCI.DAG;
3848   // Skip non-integer, non-scalar case
3849   EVT VT=N0.getValueType();
3850   if (VT.isVector())
3851     return SDValue();
3852 
3853   // fold (add (mul a, b), c) -> (mad a, b, c)
3854   //
3855   if (N0.getOpcode() == ISD::MUL) {
3856     assert (VT.isInteger());
3857     // For integer:
3858     // Since integer multiply-add costs the same as integer multiply
3859     // but is more costly than integer add, do the fusion only when
3860     // the mul is only used in the add.
3861     if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
3862         !N0.getNode()->hasOneUse())
3863       return SDValue();
3864 
3865     // Do the folding
3866     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
3867                        N0.getOperand(0), N0.getOperand(1), N1);
3868   }
3869   else if (N0.getOpcode() == ISD::FMUL) {
3870     if (VT == MVT::f32 || VT == MVT::f64) {
3871       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
3872           &DAG.getTargetLoweringInfo());
3873       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
3874         return SDValue();
3875 
3876       // For floating point:
3877       // Do the fusion only when the mul has less than 5 uses and all
3878       // are add.
3879       // The heuristic is that if a use is not an add, then that use
3880       // cannot be fused into fma, therefore mul is still needed anyway.
3881       // If there are more than 4 uses, even if they are all add, fusing
3882       // them will increase register pressue.
3883       //
3884       int numUses = 0;
3885       int nonAddCount = 0;
3886       for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
3887            UE = N0.getNode()->use_end();
3888            UI != UE; ++UI) {
3889         numUses++;
3890         SDNode *User = *UI;
3891         if (User->getOpcode() != ISD::FADD)
3892           ++nonAddCount;
3893       }
3894       if (numUses >= 5)
3895         return SDValue();
3896       if (nonAddCount) {
3897         int orderNo = N->getIROrder();
3898         int orderNo2 = N0.getNode()->getIROrder();
3899         // simple heuristics here for considering potential register
3900         // pressure, the logics here is that the differnce are used
3901         // to measure the distance between def and use, the longer distance
3902         // more likely cause register pressure.
3903         if (orderNo - orderNo2 < 500)
3904           return SDValue();
3905 
3906         // Now, check if at least one of the FMUL's operands is live beyond the node N,
3907         // which guarantees that the FMA will not increase register pressure at node N.
3908         bool opIsLive = false;
3909         const SDNode *left = N0.getOperand(0).getNode();
3910         const SDNode *right = N0.getOperand(1).getNode();
3911 
3912         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
3913           opIsLive = true;
3914 
3915         if (!opIsLive)
3916           for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
3917             SDNode *User = *UI;
3918             int orderNo3 = User->getIROrder();
3919             if (orderNo3 > orderNo) {
3920               opIsLive = true;
3921               break;
3922             }
3923           }
3924 
3925         if (!opIsLive)
3926           for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
3927             SDNode *User = *UI;
3928             int orderNo3 = User->getIROrder();
3929             if (orderNo3 > orderNo) {
3930               opIsLive = true;
3931               break;
3932             }
3933           }
3934 
3935         if (!opIsLive)
3936           return SDValue();
3937       }
3938 
3939       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
3940                          N0.getOperand(0), N0.getOperand(1), N1);
3941     }
3942   }
3943 
3944   return SDValue();
3945 }
3946 
3947 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
3948 ///
PerformADDCombine(SDNode * N,TargetLowering::DAGCombinerInfo & DCI,const NVPTXSubtarget & Subtarget,CodeGenOpt::Level OptLevel)3949 static SDValue PerformADDCombine(SDNode *N,
3950                                  TargetLowering::DAGCombinerInfo &DCI,
3951                                  const NVPTXSubtarget &Subtarget,
3952                                  CodeGenOpt::Level OptLevel) {
3953   SDValue N0 = N->getOperand(0);
3954   SDValue N1 = N->getOperand(1);
3955 
3956   // First try with the default operand order.
3957   if (SDValue Result =
3958           PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
3959     return Result;
3960 
3961   // If that didn't work, try again with the operands commuted.
3962   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
3963 }
3964 
PerformANDCombine(SDNode * N,TargetLowering::DAGCombinerInfo & DCI)3965 static SDValue PerformANDCombine(SDNode *N,
3966                                  TargetLowering::DAGCombinerInfo &DCI) {
3967   // The type legalizer turns a vector load of i8 values into a zextload to i16
3968   // registers, optionally ANY_EXTENDs it (if target type is integer),
3969   // and ANDs off the high 8 bits. Since we turn this load into a
3970   // target-specific DAG node, the DAG combiner fails to eliminate these AND
3971   // nodes. Do that here.
3972   SDValue Val = N->getOperand(0);
3973   SDValue Mask = N->getOperand(1);
3974 
3975   if (isa<ConstantSDNode>(Val)) {
3976     std::swap(Val, Mask);
3977   }
3978 
3979   SDValue AExt;
3980   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
3981   if (Val.getOpcode() == ISD::ANY_EXTEND) {
3982     AExt = Val;
3983     Val = Val->getOperand(0);
3984   }
3985 
3986   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
3987     Val = Val->getOperand(0);
3988   }
3989 
3990   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
3991       Val->getOpcode() == NVPTXISD::LoadV4) {
3992     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
3993     if (!MaskCnst) {
3994       // Not an AND with a constant
3995       return SDValue();
3996     }
3997 
3998     uint64_t MaskVal = MaskCnst->getZExtValue();
3999     if (MaskVal != 0xff) {
4000       // Not an AND that chops off top 8 bits
4001       return SDValue();
4002     }
4003 
4004     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4005     if (!Mem) {
4006       // Not a MemSDNode?!?
4007       return SDValue();
4008     }
4009 
4010     EVT MemVT = Mem->getMemoryVT();
4011     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4012       // We only handle the i8 case
4013       return SDValue();
4014     }
4015 
4016     unsigned ExtType =
4017       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4018         getZExtValue();
4019     if (ExtType == ISD::SEXTLOAD) {
4020       // If for some reason the load is a sextload, the and is needed to zero
4021       // out the high 8 bits
4022       return SDValue();
4023     }
4024 
4025     bool AddTo = false;
4026     if (AExt.getNode() != 0) {
4027       // Re-insert the ext as a zext.
4028       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4029                             AExt.getValueType(), Val);
4030       AddTo = true;
4031     }
4032 
4033     // If we get here, the AND is unnecessary.  Just replace it with the load
4034     DCI.CombineTo(N, Val, AddTo);
4035   }
4036 
4037   return SDValue();
4038 }
4039 
PerformSELECTCombine(SDNode * N,TargetLowering::DAGCombinerInfo & DCI)4040 static SDValue PerformSELECTCombine(SDNode *N,
4041                                     TargetLowering::DAGCombinerInfo &DCI) {
4042   // Currently this detects patterns for integer min and max and
4043   // lowers them to PTX-specific intrinsics that enable hardware
4044   // support.
4045 
4046   const SDValue Cond = N->getOperand(0);
4047   if (Cond.getOpcode() != ISD::SETCC) return SDValue();
4048 
4049   const SDValue LHS = Cond.getOperand(0);
4050   const SDValue RHS = Cond.getOperand(1);
4051   const SDValue True = N->getOperand(1);
4052   const SDValue False = N->getOperand(2);
4053   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
4054     return SDValue();
4055 
4056   const EVT VT = N->getValueType(0);
4057   if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
4058 
4059   const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4060   SDValue Larger;  // The larger of LHS and RHS when condition is true.
4061   switch (CC) {
4062     case ISD::SETULT:
4063     case ISD::SETULE:
4064     case ISD::SETLT:
4065     case ISD::SETLE:
4066       Larger = RHS;
4067       break;
4068 
4069     case ISD::SETGT:
4070     case ISD::SETGE:
4071     case ISD::SETUGT:
4072     case ISD::SETUGE:
4073       Larger = LHS;
4074       break;
4075 
4076     default:
4077       return SDValue();
4078   }
4079   const bool IsMax = (Larger == True);
4080   const bool IsSigned = ISD::isSignedIntSetCC(CC);
4081 
4082   unsigned IntrinsicId;
4083   if (VT == MVT::i32) {
4084     if (IsSigned)
4085       IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
4086     else
4087       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
4088   } else {
4089     assert(VT == MVT::i64);
4090     if (IsSigned)
4091       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
4092     else
4093       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
4094   }
4095 
4096   SDLoc DL(N);
4097   return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
4098                          DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
4099 }
4100 
/// Signedness classification reported for a multiply operand.
enum OperandSignedness {
  Signed = 0,   // Operand is known to be signed.
  Unsigned = 1, // Operand is known to be unsigned.
  Unknown = 2   // Signedness could not be determined.
};
4106 
4107 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4108 /// that can be demoted to \p OptSize bits without loss of information. The
4109 /// signedness of the operand, if determinable, is placed in \p S.
IsMulWideOperandDemotable(SDValue Op,unsigned OptSize,OperandSignedness & S)4110 static bool IsMulWideOperandDemotable(SDValue Op,
4111                                       unsigned OptSize,
4112                                       OperandSignedness &S) {
4113   S = Unknown;
4114 
4115   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4116       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4117     EVT OrigVT = Op.getOperand(0).getValueType();
4118     if (OrigVT.getSizeInBits() <= OptSize) {
4119       S = Signed;
4120       return true;
4121     }
4122   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4123     EVT OrigVT = Op.getOperand(0).getValueType();
4124     if (OrigVT.getSizeInBits() <= OptSize) {
4125       S = Unsigned;
4126       return true;
4127     }
4128   }
4129 
4130   return false;
4131 }
4132 
4133 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4134 /// be demoted to \p OptSize bits without loss of information. If the operands
4135 /// contain a constant, it should appear as the RHS operand. The signedness of
4136 /// the operands is placed in \p IsSigned.
AreMulWideOperandsDemotable(SDValue LHS,SDValue RHS,unsigned OptSize,bool & IsSigned)4137 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4138                                         unsigned OptSize,
4139                                         bool &IsSigned) {
4140 
4141   OperandSignedness LHSSign;
4142 
4143   // The LHS operand must be a demotable op
4144   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4145     return false;
4146 
4147   // We should have been able to determine the signedness from the LHS
4148   if (LHSSign == Unknown)
4149     return false;
4150 
4151   IsSigned = (LHSSign == Signed);
4152 
4153   // The RHS can be a demotable op or a constant
4154   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4155     const APInt &Val = CI->getAPIntValue();
4156     if (LHSSign == Unsigned) {
4157       return Val.isIntN(OptSize);
4158     } else {
4159       return Val.isSignedIntN(OptSize);
4160     }
4161   } else {
4162     OperandSignedness RHSSign;
4163     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4164       return false;
4165 
4166     return LHSSign == RHSSign;
4167   }
4168 }
4169 
4170 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4171 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4172 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4173 /// amount.
TryMULWIDECombine(SDNode * N,TargetLowering::DAGCombinerInfo & DCI)4174 static SDValue TryMULWIDECombine(SDNode *N,
4175                                  TargetLowering::DAGCombinerInfo &DCI) {
4176   EVT MulType = N->getValueType(0);
4177   if (MulType != MVT::i32 && MulType != MVT::i64) {
4178     return SDValue();
4179   }
4180 
4181   SDLoc DL(N);
4182   unsigned OptSize = MulType.getSizeInBits() >> 1;
4183   SDValue LHS = N->getOperand(0);
4184   SDValue RHS = N->getOperand(1);
4185 
4186   // Canonicalize the multiply so the constant (if any) is on the right
4187   if (N->getOpcode() == ISD::MUL) {
4188     if (isa<ConstantSDNode>(LHS)) {
4189       std::swap(LHS, RHS);
4190     }
4191   }
4192 
4193   // If we have a SHL, determine the actual multiply amount
4194   if (N->getOpcode() == ISD::SHL) {
4195     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4196     if (!ShlRHS) {
4197       return SDValue();
4198     }
4199 
4200     APInt ShiftAmt = ShlRHS->getAPIntValue();
4201     unsigned BitWidth = MulType.getSizeInBits();
4202     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4203       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4204       RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4205     } else {
4206       return SDValue();
4207     }
4208   }
4209 
4210   bool Signed;
4211   // Verify that our operands are demotable
4212   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4213     return SDValue();
4214   }
4215 
4216   EVT DemotedVT;
4217   if (MulType == MVT::i32) {
4218     DemotedVT = MVT::i16;
4219   } else {
4220     DemotedVT = MVT::i32;
4221   }
4222 
4223   // Truncate the operands to the correct size. Note that these are just for
4224   // type consistency and will (likely) be eliminated in later phases.
4225   SDValue TruncLHS =
4226     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4227   SDValue TruncRHS =
4228     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4229 
4230   unsigned Opc;
4231   if (Signed) {
4232     Opc = NVPTXISD::MUL_WIDE_SIGNED;
4233   } else {
4234     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4235   }
4236 
4237   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4238 }
4239 
4240 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
PerformMULCombine(SDNode * N,TargetLowering::DAGCombinerInfo & DCI,CodeGenOpt::Level OptLevel)4241 static SDValue PerformMULCombine(SDNode *N,
4242                                  TargetLowering::DAGCombinerInfo &DCI,
4243                                  CodeGenOpt::Level OptLevel) {
4244   if (OptLevel > 0) {
4245     // Try mul.wide combining at OptLevel > 0
4246     if (SDValue Ret = TryMULWIDECombine(N, DCI))
4247       return Ret;
4248   }
4249 
4250   return SDValue();
4251 }
4252 
4253 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
PerformSHLCombine(SDNode * N,TargetLowering::DAGCombinerInfo & DCI,CodeGenOpt::Level OptLevel)4254 static SDValue PerformSHLCombine(SDNode *N,
4255                                  TargetLowering::DAGCombinerInfo &DCI,
4256                                  CodeGenOpt::Level OptLevel) {
4257   if (OptLevel > 0) {
4258     // Try mul.wide combining at OptLevel > 0
4259     if (SDValue Ret = TryMULWIDECombine(N, DCI))
4260       return Ret;
4261   }
4262 
4263   return SDValue();
4264 }
4265 
PerformDAGCombine(SDNode * N,DAGCombinerInfo & DCI) const4266 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4267                                                DAGCombinerInfo &DCI) const {
4268   CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4269   switch (N->getOpcode()) {
4270     default: break;
4271     case ISD::ADD:
4272     case ISD::FADD:
4273       return PerformADDCombine(N, DCI, STI, OptLevel);
4274     case ISD::MUL:
4275       return PerformMULCombine(N, DCI, OptLevel);
4276     case ISD::SHL:
4277       return PerformSHLCombine(N, DCI, OptLevel);
4278     case ISD::AND:
4279       return PerformANDCombine(N, DCI);
4280     case ISD::SELECT:
4281       return PerformSELECTCombine(N, DCI);
4282   }
4283   return SDValue();
4284 }
4285 
4286 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
ReplaceLoadVector(SDNode * N,SelectionDAG & DAG,SmallVectorImpl<SDValue> & Results)4287 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4288                               SmallVectorImpl<SDValue> &Results) {
4289   EVT ResVT = N->getValueType(0);
4290   SDLoc DL(N);
4291 
4292   assert(ResVT.isVector() && "Vector load must have vector type");
4293 
4294   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4295   // legal.  We can (and should) split that into 2 loads of <2 x double> here
4296   // but I'm leaving that as a TODO for now.
4297   assert(ResVT.isSimple() && "Can only handle simple types");
4298   switch (ResVT.getSimpleVT().SimpleTy) {
4299   default:
4300     return;
4301   case MVT::v2i8:
4302   case MVT::v2i16:
4303   case MVT::v2i32:
4304   case MVT::v2i64:
4305   case MVT::v2f32:
4306   case MVT::v2f64:
4307   case MVT::v4i8:
4308   case MVT::v4i16:
4309   case MVT::v4i32:
4310   case MVT::v4f32:
4311     // This is a "native" vector type
4312     break;
4313   }
4314 
4315   LoadSDNode *LD = cast<LoadSDNode>(N);
4316 
4317   unsigned Align = LD->getAlignment();
4318   auto &TD = DAG.getDataLayout();
4319   unsigned PrefAlign =
4320       TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4321   if (Align < PrefAlign) {
4322     // This load is not sufficiently aligned, so bail out and let this vector
4323     // load be scalarized.  Note that we may still be able to emit smaller
4324     // vector loads.  For example, if we are loading a <4 x float> with an
4325     // alignment of 8, this check will fail but the legalizer will try again
4326     // with 2 x <2 x float>, which will succeed with an alignment of 8.
4327     return;
4328   }
4329 
4330   EVT EltVT = ResVT.getVectorElementType();
4331   unsigned NumElts = ResVT.getVectorNumElements();
4332 
4333   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4334   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
4335   // loaded type to i16 and propagate the "real" type as the memory type.
4336   bool NeedTrunc = false;
4337   if (EltVT.getSizeInBits() < 16) {
4338     EltVT = MVT::i16;
4339     NeedTrunc = true;
4340   }
4341 
4342   unsigned Opcode = 0;
4343   SDVTList LdResVTs;
4344 
4345   switch (NumElts) {
4346   default:
4347     return;
4348   case 2:
4349     Opcode = NVPTXISD::LoadV2;
4350     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4351     break;
4352   case 4: {
4353     Opcode = NVPTXISD::LoadV4;
4354     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4355     LdResVTs = DAG.getVTList(ListVTs);
4356     break;
4357   }
4358   }
4359 
4360   // Copy regular operands
4361   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4362 
4363   // The select routine does not have access to the LoadSDNode instance, so
4364   // pass along the extension information
4365   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4366 
4367   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4368                                           LD->getMemoryVT(),
4369                                           LD->getMemOperand());
4370 
4371   SmallVector<SDValue, 4> ScalarRes;
4372 
4373   for (unsigned i = 0; i < NumElts; ++i) {
4374     SDValue Res = NewLD.getValue(i);
4375     if (NeedTrunc)
4376       Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4377     ScalarRes.push_back(Res);
4378   }
4379 
4380   SDValue LoadChain = NewLD.getValue(NumElts);
4381 
4382   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
4383 
4384   Results.push_back(BuildVec);
4385   Results.push_back(LoadChain);
4386 }
4387 
ReplaceINTRINSIC_W_CHAIN(SDNode * N,SelectionDAG & DAG,SmallVectorImpl<SDValue> & Results)4388 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4389                                      SmallVectorImpl<SDValue> &Results) {
4390   SDValue Chain = N->getOperand(0);
4391   SDValue Intrin = N->getOperand(1);
4392   SDLoc DL(N);
4393 
4394   // Get the intrinsic ID
4395   unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4396   switch (IntrinNo) {
4397   default:
4398     return;
4399   case Intrinsic::nvvm_ldg_global_i:
4400   case Intrinsic::nvvm_ldg_global_f:
4401   case Intrinsic::nvvm_ldg_global_p:
4402   case Intrinsic::nvvm_ldu_global_i:
4403   case Intrinsic::nvvm_ldu_global_f:
4404   case Intrinsic::nvvm_ldu_global_p: {
4405     EVT ResVT = N->getValueType(0);
4406 
4407     if (ResVT.isVector()) {
4408       // Vector LDG/LDU
4409 
4410       unsigned NumElts = ResVT.getVectorNumElements();
4411       EVT EltVT = ResVT.getVectorElementType();
4412 
4413       // Since LDU/LDG are target nodes, we cannot rely on DAG type
4414       // legalization.
4415       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
4416       // loaded type to i16 and propagate the "real" type as the memory type.
4417       bool NeedTrunc = false;
4418       if (EltVT.getSizeInBits() < 16) {
4419         EltVT = MVT::i16;
4420         NeedTrunc = true;
4421       }
4422 
4423       unsigned Opcode = 0;
4424       SDVTList LdResVTs;
4425 
4426       switch (NumElts) {
4427       default:
4428         return;
4429       case 2:
4430         switch (IntrinNo) {
4431         default:
4432           return;
4433         case Intrinsic::nvvm_ldg_global_i:
4434         case Intrinsic::nvvm_ldg_global_f:
4435         case Intrinsic::nvvm_ldg_global_p:
4436           Opcode = NVPTXISD::LDGV2;
4437           break;
4438         case Intrinsic::nvvm_ldu_global_i:
4439         case Intrinsic::nvvm_ldu_global_f:
4440         case Intrinsic::nvvm_ldu_global_p:
4441           Opcode = NVPTXISD::LDUV2;
4442           break;
4443         }
4444         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4445         break;
4446       case 4: {
4447         switch (IntrinNo) {
4448         default:
4449           return;
4450         case Intrinsic::nvvm_ldg_global_i:
4451         case Intrinsic::nvvm_ldg_global_f:
4452         case Intrinsic::nvvm_ldg_global_p:
4453           Opcode = NVPTXISD::LDGV4;
4454           break;
4455         case Intrinsic::nvvm_ldu_global_i:
4456         case Intrinsic::nvvm_ldu_global_f:
4457         case Intrinsic::nvvm_ldu_global_p:
4458           Opcode = NVPTXISD::LDUV4;
4459           break;
4460         }
4461         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4462         LdResVTs = DAG.getVTList(ListVTs);
4463         break;
4464       }
4465       }
4466 
4467       SmallVector<SDValue, 8> OtherOps;
4468 
4469       // Copy regular operands
4470 
4471       OtherOps.push_back(Chain); // Chain
4472                                  // Skip operand 1 (intrinsic ID)
4473       // Others
4474       OtherOps.append(N->op_begin() + 2, N->op_end());
4475 
4476       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
4477 
4478       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4479                                               MemSD->getMemoryVT(),
4480                                               MemSD->getMemOperand());
4481 
4482       SmallVector<SDValue, 4> ScalarRes;
4483 
4484       for (unsigned i = 0; i < NumElts; ++i) {
4485         SDValue Res = NewLD.getValue(i);
4486         if (NeedTrunc)
4487           Res =
4488               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4489         ScalarRes.push_back(Res);
4490       }
4491 
4492       SDValue LoadChain = NewLD.getValue(NumElts);
4493 
4494       SDValue BuildVec =
4495           DAG.getBuildVector(ResVT, DL, ScalarRes);
4496 
4497       Results.push_back(BuildVec);
4498       Results.push_back(LoadChain);
4499     } else {
4500       // i8 LDG/LDU
4501       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
4502              "Custom handling of non-i8 ldu/ldg?");
4503 
4504       // Just copy all operands as-is
4505       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
4506 
4507       // Force output to i16
4508       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
4509 
4510       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
4511 
4512       // We make sure the memory type is i8, which will be used during isel
4513       // to select the proper instruction.
4514       SDValue NewLD =
4515           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
4516                                   MVT::i8, MemSD->getMemOperand());
4517 
4518       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
4519                                     NewLD.getValue(0)));
4520       Results.push_back(NewLD.getValue(1));
4521     }
4522   }
4523   }
4524 }
4525 
ReplaceNodeResults(SDNode * N,SmallVectorImpl<SDValue> & Results,SelectionDAG & DAG) const4526 void NVPTXTargetLowering::ReplaceNodeResults(
4527     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
4528   switch (N->getOpcode()) {
4529   default:
4530     report_fatal_error("Unhandled custom legalization");
4531   case ISD::LOAD:
4532     ReplaceLoadVector(N, DAG, Results);
4533     return;
4534   case ISD::INTRINSIC_W_CHAIN:
4535     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
4536     return;
4537   }
4538 }
4539 
4540 // Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
anchor()4541 void NVPTXSection::anchor() {}
4542 
~NVPTXTargetObjectFile()4543 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
4544   delete static_cast<NVPTXSection *>(TextSection);
4545   delete static_cast<NVPTXSection *>(DataSection);
4546   delete static_cast<NVPTXSection *>(BSSSection);
4547   delete static_cast<NVPTXSection *>(ReadOnlySection);
4548 
4549   delete static_cast<NVPTXSection *>(StaticCtorSection);
4550   delete static_cast<NVPTXSection *>(StaticDtorSection);
4551   delete static_cast<NVPTXSection *>(LSDASection);
4552   delete static_cast<NVPTXSection *>(EHFrameSection);
4553   delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
4554   delete static_cast<NVPTXSection *>(DwarfInfoSection);
4555   delete static_cast<NVPTXSection *>(DwarfLineSection);
4556   delete static_cast<NVPTXSection *>(DwarfFrameSection);
4557   delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
4558   delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
4559   delete static_cast<NVPTXSection *>(DwarfStrSection);
4560   delete static_cast<NVPTXSection *>(DwarfLocSection);
4561   delete static_cast<NVPTXSection *>(DwarfARangesSection);
4562   delete static_cast<NVPTXSection *>(DwarfRangesSection);
4563   delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
4564 }
4565 
4566 MCSection *
SelectSectionForGlobal(const GlobalValue * GV,SectionKind Kind,Mangler & Mang,const TargetMachine & TM) const4567 NVPTXTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
4568                                               SectionKind Kind, Mangler &Mang,
4569                                               const TargetMachine &TM) const {
4570   return getDataSection();
4571 }
4572