//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
                                       const R600Subtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  }

  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);

  // Set condition code actions
  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // ADD, SUB overflow.
  // TODO: turn these into Legal?
  if (Subtarget->hasCARRY())
    setOperationAction(ISD::UADDO, MVT::i32, Custom);

  if (Subtarget->hasBORROW())
    setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setSchedulingPreference(Sched::Source);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
}

const R600Subtarget *R600TargetLowering::getSubtarget() const {
  return static_cast<const R600Subtarget *>(Subtarget);
}

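// Returns true if the instruction immediately following \p I is a RETURN,
// i.e. \p I is the last instruction before the end of the program.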
static inline bool isEOP(MachineBasicBlock::iterator I) {
  return std::next(I)->getOpcode() == AMDGPU::RETURN;
}

MachineBasicBlock *
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = MI;
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();

  switch (MI.getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI.getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      //        LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
          MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI.getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI.getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr *defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
                                                            .getFPImm()
                                                            ->getValueAPF()
                                                            .bitcastToAPInt()
                                                            .getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
                     MI.getOperand(1).getImm());
    break;
  case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
    // TODO: Perhaps combine this instruction with the next if possible
    auto MIB = TII->buildDefaultInstruction(
        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
    int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
    // TODO: Ugh this is rather ugly
    MIB->getOperand(Idx) = MI.getOperand(1);
    break;
  }
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
                       MI.getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addImm(isEOP(I)); // Set End of program bit
    break;
  }
  case AMDGPU::RAT_STORE_TYPED_eg: {
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addOperand(MI.getOperand(2))
        .addImm(isEOP(I)); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI.getOperand(4);
    MachineOperand &SID = MI.getOperand(5);
    unsigned TextureId = MI.getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
            T0)
        .addOperand(MI.getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
            T1)
        .addOperand(MI.getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI.getOperand(4);
    MachineOperand &SID = MI.getOperand(5);
    unsigned TextureId = MI.getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
            T0)
        .addOperand(MI.getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
            T1)
        .addOperand(MI.getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI.getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI.getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI.getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI.getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = isEOP(I);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addOperand(MI.getOperand(2))
        .addOperand(MI.getOperand(3))
        .addOperand(MI.getOperand(4))
        .addOperand(MI.getOperand(5))
        .addOperand(MI.getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI.eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::R600_store_swizzle: {
      SDLoc DL(Op);
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
        DAG.getConstant(3, DL, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::r600_tex:
    case AMDGPUIntrinsic::r600_texc:
    case AMDGPUIntrinsic::r600_txl:
    case AMDGPUIntrinsic::r600_txlc:
    case AMDGPUIntrinsic::r600_txb:
    case AMDGPUIntrinsic::r600_txbc:
    case AMDGPUIntrinsic::r600_txf:
    case AMDGPUIntrinsic::r600_txq:
    case AMDGPUIntrinsic::r600_ddx:
    case AMDGPUIntrinsic::r600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::r600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::r600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::r600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::r600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::r600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::r600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::r600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::r600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::r600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::r600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, DL, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::r600_dot4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, DL, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(0, DL, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(1, DL, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(1, DL, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(2, DL, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(2, DL, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(3, DL, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(3, DL, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_implicitarg_ptr: {
      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
      uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
      return DAG.getConstant(ByteOffset, DL, PtrVT);
    }
    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_workdim:
    case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
      uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
      return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
    }

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);

    // FIXME: Should be renamed to r600 prefix
    case AMDGPUIntrinsic::AMDGPU_rsq_clamped:
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));

    case Intrinsic::r600_rsq:
    case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values
    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    // considers some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}

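// Extract every element of \p Vector and rebuild it as an
// AMDGPUISD::BUILD_VERTICAL_VECTOR node.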
SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                               SDValue Op,
                                               SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  const DataLayout &DL = DAG.getDataLayout();
  const GlobalValue *GV = GSD->getGlobal();
  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);

  SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
  return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Should this propagate fast-math-flags?
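  // 0.15915494309 is an approximation of 1 / (2 * pi), used to map the input
  // angle into the normalized range the hardware expects.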
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT,
        DAG.getNode(ISD::FMUL, DL, VT, Arg,
          DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
        DAG.getConstantFP(0.5, DL, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT, FractPart,
        DAG.getConstantFP(-0.5, DL, MVT::f32)));
  if (Gen >= R600Subtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
      DAG.getConstantFP(3.14159265359, DL, MVT::f32));
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One  = DAG.getConstant(1, DL, VT);

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps, the alternative is to
  // add a conditional to filter the special case.

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One  = DAG.getConstant(1, DL, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps, the alternative is to
  // add a conditional to filter the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

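// Lower UADDO/USUBO by emitting the main arithmetic node \p mainop together
// with the corresponding carry/borrow node \p ovf and merging the two results.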
SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
                                          unsigned mainop, unsigned ovf) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
  // Extend sign.
  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
                    DAG.getValueType(MVT::i1));

  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(
      ISD::SETCC,
      DL,
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   const SDLoc &DL,
                                                   unsigned DwordOffset) const {
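  // Implicit parameters live in constant buffer 0; DwordOffset is given in
  // 32-bit words, so convert it to a byte offset for the load below.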
  unsigned ByteOffset = DwordOffset * 4;
  PointerType *PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->isExactlyValue(1.0);
  }
  return isAllOnesConstant(Op);
}

bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->getValueAPF().isZero();
  }
  return isNullConstant(Op);
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
/// convert these pointers to a register index.  Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  SDLoc DL(Ptr);
  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, DL, MVT::i32));
}

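// Map vector element \p ElemIdx to the register \p Channel and pointer
// increment \p PtrIncr to use, based on how many channels (\p StackWidth)
// of each stack register are in use.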
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Store);

  unsigned Mask = 0;
  if (Store->getMemoryVT() == MVT::i8) {
    Mask = 0xff;
  } else if (Store->getMemoryVT() == MVT::i16) {
    Mask = 0xffff;
  }

  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  EVT MemVT = Store->getMemoryVT();

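  // Sub-dword private stores are lowered as a read-modify-write of the
  // containing 32-bit dword: load it, clear the target bits, OR in the
  // shifted value, and store the dword back.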
1321   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1322                             DAG.getConstant(2, DL, MVT::i32));
1323   SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1324                             Chain, Ptr,
1325                             DAG.getTargetConstant(0, DL, MVT::i32));
1326 
1327   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1328                                 DAG.getConstant(0x3, DL, MVT::i32));
1329 
1330   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1331                                  DAG.getConstant(3, DL, MVT::i32));
1332 
1333   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1334                                   Store->getValue());
1335 
1336   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1337 
1338   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1339                                      MaskedValue, ShiftAmt);
1340 
1341   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1342                                 DAG.getConstant(Mask, DL, MVT::i32),
1343                                 ShiftAmt);
1344   DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1345                         DAG.getConstant(0xffffffff, DL, MVT::i32));
1346   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1347 
1348   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1349   return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1350                      Chain, Value, Ptr,
1351                      DAG.getTargetConstant(0, DL, MVT::i32));
1352 }
1353 
LowerSTORE(SDValue Op,SelectionDAG & DAG) const1354 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1355   if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
1356     return Result;
1357 
1358   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1359   unsigned AS = StoreNode->getAddressSpace();
1360   SDValue Value = StoreNode->getValue();
1361   EVT ValueVT = Value.getValueType();
1362 
1363   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
1364       ValueVT.isVector()) {
1365     return SplitVectorStore(Op, DAG);
1366   }
1367 
1368   SDLoc DL(Op);
1369   SDValue Chain = StoreNode->getChain();
1370   SDValue Ptr = StoreNode->getBasePtr();
1371 
1372   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1373     if (StoreNode->isTruncatingStore()) {
1374       EVT VT = Value.getValueType();
1375       assert(VT.bitsLE(MVT::i32));
1376       EVT MemVT = StoreNode->getMemoryVT();
1377       SDValue MaskConstant;
1378       if (MemVT == MVT::i8) {
1379         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1380       } else {
1381         assert(MemVT == MVT::i16);
1382         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1383       }
1384       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1385                                       DAG.getConstant(2, DL, MVT::i32));
1386       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1387                                       DAG.getConstant(0x00000003, DL, VT));
1388       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1389       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1390                                    DAG.getConstant(3, DL, VT));
1391       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1392       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1393       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1394       // vector instead.
1395       SDValue Src[4] = {
1396         ShiftedValue,
1397         DAG.getConstant(0, DL, MVT::i32),
1398         DAG.getConstant(0, DL, MVT::i32),
1399         Mask
1400       };
1401       SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1402       SDValue Args[3] = { Chain, Input, DWordAddr };
1403       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1404                                      Op->getVTList(), Args, MemVT,
1405                                      StoreNode->getMemOperand());
1406     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1407                ValueVT.bitsGE(MVT::i32)) {
1408       // Convert pointer from byte address to dword address.
1409       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1410                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1411                                     Ptr, DAG.getConstant(2, DL, MVT::i32)));
1412 
1413       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1414         llvm_unreachable("Truncated and indexed stores not supported yet");
1415       } else {
1416         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1417       }
1418       return Chain;
1419     }
1420   }
1421 
1422   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1423     return SDValue();
1424 
1425   EVT MemVT = StoreNode->getMemoryVT();
1426   if (MemVT.bitsLT(MVT::i32))
1427     return lowerPrivateTruncStore(StoreNode, DAG);
1428 
1429   // Lowering for indirect addressing
1430   const MachineFunction &MF = DAG.getMachineFunction();
1431   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1432   unsigned StackWidth = TFL->getStackWidth(MF);
1433 
1434   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1435 
1436   if (ValueVT.isVector()) {
1437     unsigned NumElemVT = ValueVT.getVectorNumElements();
1438     EVT ElemVT = ValueVT.getVectorElementType();
1439     SmallVector<SDValue, 4> Stores(NumElemVT);
1440 
1441     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1442                                       "vector width in store");
1443 
1444     for (unsigned i = 0; i < NumElemVT; ++i) {
1445       unsigned Channel, PtrIncr;
1446       getStackAddress(StackWidth, i, Channel, PtrIncr);
1447       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1448                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1449       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1450                                  Value, DAG.getConstant(i, DL, MVT::i32));
1451 
1452       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1453                               Chain, Elem, Ptr,
1454                               DAG.getTargetConstant(Channel, DL, MVT::i32));
1455     }
1456     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1457   } else {
1458     if (ValueVT == MVT::i8) {
1459       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1460     }
1461     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value,
1462                         Ptr, DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1463   }
1464 
1465   return Chain;
1466 }
1467 
1468 // Returns 512 + (kc_bank << 12), or -1 if AddressSpace is not a constant buffer.
1469 static int
1470 ConstantAddressBlock(unsigned AddressSpace) {
1471   switch (AddressSpace) {
1472   case AMDGPUAS::CONSTANT_BUFFER_0:
1473     return 512;
1474   case AMDGPUAS::CONSTANT_BUFFER_1:
1475     return 512 + 4096;
1476   case AMDGPUAS::CONSTANT_BUFFER_2:
1477     return 512 + 4096 * 2;
1478   case AMDGPUAS::CONSTANT_BUFFER_3:
1479     return 512 + 4096 * 3;
1480   case AMDGPUAS::CONSTANT_BUFFER_4:
1481     return 512 + 4096 * 4;
1482   case AMDGPUAS::CONSTANT_BUFFER_5:
1483     return 512 + 4096 * 5;
1484   case AMDGPUAS::CONSTANT_BUFFER_6:
1485     return 512 + 4096 * 6;
1486   case AMDGPUAS::CONSTANT_BUFFER_7:
1487     return 512 + 4096 * 7;
1488   case AMDGPUAS::CONSTANT_BUFFER_8:
1489     return 512 + 4096 * 8;
1490   case AMDGPUAS::CONSTANT_BUFFER_9:
1491     return 512 + 4096 * 9;
1492   case AMDGPUAS::CONSTANT_BUFFER_10:
1493     return 512 + 4096 * 10;
1494   case AMDGPUAS::CONSTANT_BUFFER_11:
1495     return 512 + 4096 * 11;
1496   case AMDGPUAS::CONSTANT_BUFFER_12:
1497     return 512 + 4096 * 12;
1498   case AMDGPUAS::CONSTANT_BUFFER_13:
1499     return 512 + 4096 * 13;
1500   case AMDGPUAS::CONSTANT_BUFFER_14:
1501     return 512 + 4096 * 14;
1502   case AMDGPUAS::CONSTANT_BUFFER_15:
1503     return 512 + 4096 * 15;
1504   default:
1505     return -1;
1506   }
1507 }
1508 
1509 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1510                                                 SelectionDAG &DAG) const {
1511   SDLoc DL(Op);
1512   LoadSDNode *Load = cast<LoadSDNode>(Op);
1513   ISD::LoadExtType ExtType = Load->getExtensionType();
1514   EVT MemVT = Load->getMemoryVT();
1515 
1516   // On R600 (pre-SI), private-address extending loads narrower than 32 bits
1517   // are lowered to a 32-bit register load plus a byte/halfword extract.
1518 
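  // For example, an extending i8 load from private byte address P becomes,
  // roughly:
  //   dword  = REGISTER_LOAD(P >> 2)
  //   result = dword >> ((P & 3) * 8), then sign- or zero-extended in place.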
1519   // Get Register holding the target.
1520   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1521                             DAG.getConstant(2, DL, MVT::i32));
1522   // Load the Register.
1523   SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1524                             Load->getChain(),
1525                             Ptr,
1526                             DAG.getTargetConstant(0, DL, MVT::i32),
1527                             Op.getOperand(2));
1528 
1529   // Get offset within the register.
1530   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1531                                 Load->getBasePtr(),
1532                                 DAG.getConstant(0x3, DL, MVT::i32));
1533 
1534   // Bit offset of target byte (byteIdx * 8).
1535   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1536                                  DAG.getConstant(3, DL, MVT::i32));
1537 
1538   // Shift to the right.
1539   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
1540 
1541   // Eliminate the upper bits by extending from the memory type ...
1542   EVT MemEltVT = MemVT.getScalarType();
1543 
1544   // ... with copies of the sign bit for SEXTLOAD ...
1545   if (ExtType == ISD::SEXTLOAD) {
1546     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1547 
1548     SDValue Ops[] = {
1549       DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1550       Load->getChain()
1551     };
1552 
1553     return DAG.getMergeValues(Ops, DL);
1554   }
1555 
1556   // ... or with zeros otherwise.
1557   SDValue Ops[] = {
1558     DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1559     Load->getChain()
1560   };
1561 
1562   return DAG.getMergeValues(Ops, DL);
1563 }
1564 
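// Custom load lowering: sub-32-bit extending private loads go through
// lowerPrivateExtLoad, vector LDS loads are scalarized, constant-buffer loads
// become AMDGPUISD::CONST_ADDRESS nodes, and remaining private loads use
// indirect addressing via AMDGPUISD::REGISTER_LOAD.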
1565 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1566   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1567   unsigned AS = LoadNode->getAddressSpace();
1568   EVT MemVT = LoadNode->getMemoryVT();
1569   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1570 
1571   if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1572       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1573     return lowerPrivateExtLoad(Op, DAG);
1574   }
1575 
1576   SDLoc DL(Op);
1577   EVT VT = Op.getValueType();
1578   SDValue Chain = LoadNode->getChain();
1579   SDValue Ptr = LoadNode->getBasePtr();
1580 
1581   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1582     SDValue MergedValues[2] = {
1583       scalarizeVectorLoad(LoadNode, DAG),
1584       Chain
1585     };
1586     return DAG.getMergeValues(MergedValues, DL);
1587   }
1588 
1589   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1590   if (ConstantBlock > -1 &&
1591       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1592        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1593     SDValue Result;
1594     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1595         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1596         isa<ConstantSDNode>(Ptr)) {
1597       SDValue Slots[4];
1598       for (unsigned i = 0; i < 4; i++) {
1599         // We want the constant position encoded with the following formula:
1600         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1601         // const_index is Ptr, which LLVM computes using an alignment of 16.
1602         // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
1603         // then divide by 4 at the ISel step.
1604         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1605             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1606         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1607       }
1608       EVT NewVT = MVT::v4i32;
1609       unsigned NumElements = 4;
1610       if (VT.isVector()) {
1611         NewVT = VT;
1612         NumElements = VT.getVectorNumElements();
1613       }
1614       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1615     } else {
1616       // A non-constant ptr can't be folded; keep it as a v4i32 load.
1617       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1618           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1619                       DAG.getConstant(4, DL, MVT::i32)),
1620                       DAG.getConstant(LoadNode->getAddressSpace() -
1621                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1622           );
1623     }
1624 
1625     if (!VT.isVector()) {
1626       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1627                            DAG.getConstant(0, DL, MVT::i32));
1628     }
1629 
1630     SDValue MergedValues[2] = {
1631       Result,
1632       Chain
1633     };
1634     return DAG.getMergeValues(MergedValues, DL);
1635   }
1636 
1637   SDValue LoweredLoad;
1638 
1639   // For most operations returning SDValue() will result in the node being
1640   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1641   // need to manually expand loads that may be legal in some address spaces and
1642   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1643   // compute shaders, since the data is sign extended when it is uploaded to the
1644   // buffer. However SEXT loads from other address spaces are not supported, so
1645   // we need to expand them here.
1646   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1647     EVT MemVT = LoadNode->getMemoryVT();
1648     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1649     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1650                                   LoadNode->getPointerInfo(), MemVT,
1651                                   LoadNode->isVolatile(),
1652                                   LoadNode->isNonTemporal(),
1653                                   LoadNode->isInvariant(),
1654                                   LoadNode->getAlignment());
1655     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1656                               DAG.getValueType(MemVT));
1657 
1658     SDValue MergedValues[2] = { Res, Chain };
1659     return DAG.getMergeValues(MergedValues, DL);
1660   }
1661 
1662   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1663     return SDValue();
1664   }
1665 
1666   // Lowering for indirect addressing
1667   const MachineFunction &MF = DAG.getMachineFunction();
1668   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1669   unsigned StackWidth = TFL->getStackWidth(MF);
1670 
1671   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1672 
1673   if (VT.isVector()) {
1674     unsigned NumElemVT = VT.getVectorNumElements();
1675     EVT ElemVT = VT.getVectorElementType();
1676     SDValue Loads[4];
1677 
1678     assert(NumElemVT <= 4);
1679     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1680                                       "vector width in load");
1681 
1682     for (unsigned i = 0; i < NumElemVT; ++i) {
1683       unsigned Channel, PtrIncr;
1684       getStackAddress(StackWidth, i, Channel, PtrIncr);
1685       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1686                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1687       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1688                              Chain, Ptr,
1689                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1690                              Op.getOperand(2));
1691     }
1692     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
1693     LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
1694   } else {
1695     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1696                               Chain, Ptr,
1697                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1698                               Op.getOperand(2));
1699   }
1700 
1701   SDValue Ops[2] = {
1702     LoweredLoad,
1703     Chain
1704   };
1705 
1706   return DAG.getMergeValues(Ops, DL);
1707 }
1708 
1709 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1710   SDValue Chain = Op.getOperand(0);
1711   SDValue Cond  = Op.getOperand(1);
1712   SDValue Jump  = Op.getOperand(2);
1713 
1714   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1715                      Chain, Jump, Cond);
1716 }
1717 
1718 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1719                                             SelectionDAG &DAG) const {
1720   MachineFunction &MF = DAG.getMachineFunction();
1721   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1722 
1723   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1724 
1725   unsigned FrameIndex = FIN->getIndex();
1726   unsigned IgnoredFrameReg;
1727   unsigned Offset =
1728     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
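  // Scale the frame-index offset by 4 bytes per channel times the stack width
  // to form the constant used for private-memory indexing.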
1729   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1730                          Op.getValueType());
1731 }
1732 
1733 /// XXX Only kernel functions are supported, so we can assume for now that
1734 /// every function is a kernel function, but in the future we should use
1735 /// separate calling conventions for kernel and non-kernel functions.
1736 SDValue R600TargetLowering::LowerFormalArguments(
1737     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1738     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1739     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1740   SmallVector<CCValAssign, 16> ArgLocs;
1741   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1742                  *DAG.getContext());
1743   MachineFunction &MF = DAG.getMachineFunction();
1744   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1745 
1746   SmallVector<ISD::InputArg, 8> LocalIns;
1747 
1748   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1749 
1750   AnalyzeFormalArguments(CCInfo, LocalIns);
1751 
1752   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1753     CCValAssign &VA = ArgLocs[i];
1754     const ISD::InputArg &In = Ins[i];
1755     EVT VT = In.VT;
1756     EVT MemVT = VA.getLocVT();
1757     if (!VT.isVector() && MemVT.isVector()) {
1758       // Get load source type if scalarized.
1759       MemVT = MemVT.getVectorElementType();
1760     }
1761 
1762     if (AMDGPU::isShader(CallConv)) {
1763       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1764       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1765       InVals.push_back(Register);
1766       continue;
1767     }
1768 
1769     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1770                                           AMDGPUAS::CONSTANT_BUFFER_0);
1771 
1772     // i64 isn't a legal type, so the register type used ends up as i32, which
1773     // isn't expected here. It attempts to create this sextload, but it ends up
1774     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1775     // for <1 x i64>.
1776 
1777     // The first 36 bytes of the input buffer contain information about
1778     // thread group and global sizes.
1779     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1780     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1781       // FIXME: This should really check the extload type, but the handling of
1782       // extload vector parameters seems to be broken.
1783 
1784       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1785       Ext = ISD::SEXTLOAD;
1786     }
1787 
1788     // Compute the offset from the value.
1789     // XXX - I think PartOffset should give you this, but it seems to give the
1790     // size of the register which isn't useful.
1791 
1792     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1793     unsigned PartOffset = VA.getLocMemOffset();
1794     unsigned Offset = 36 + VA.getLocMemOffset();
1795 
1796     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1797     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1798                               DAG.getConstant(Offset, DL, MVT::i32),
1799                               DAG.getUNDEF(MVT::i32),
1800                               PtrInfo,
1801                               MemVT, false, true, true, 4);
1802 
1803     // 4 is the preferred alignment for the CONSTANT memory space.
1804     InVals.push_back(Arg);
1805     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1806   }
1807   return Chain;
1808 }
1809 
1810 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1811                                            EVT VT) const {
1812   if (!VT.isVector())
1813     return MVT::i32;
1814   return VT.changeVectorElementTypeToInteger();
1815 }
1816 
1817 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1818                                                         unsigned AddrSpace,
1819                                                         unsigned Align,
1820                                                         bool *IsFast) const {
1821   if (IsFast)
1822     *IsFast = false;
1823 
1824   if (!VT.isSimple() || VT == MVT::Other)
1825     return false;
1826 
1827   if (VT.bitsLT(MVT::i32))
1828     return false;
1829 
1830   // TODO: This is a rough estimate.
1831   if (IsFast)
1832     *IsFast = true;
1833 
1834   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1835 }
1836 
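// Replace constant 0.0/1.0 lanes and duplicated lanes of a 4-element
// BUILD_VECTOR with undef, recording the corresponding swizzle selects
// (SEL_0, SEL_1, or the index of the duplicated lane) in RemapSwizzle.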
1837 static SDValue CompactSwizzlableVector(
1838   SelectionDAG &DAG, SDValue VectorEntry,
1839   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1840   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1841   assert(RemapSwizzle.empty());
1842   SDValue NewBldVec[4] = {
1843     VectorEntry.getOperand(0),
1844     VectorEntry.getOperand(1),
1845     VectorEntry.getOperand(2),
1846     VectorEntry.getOperand(3)
1847   };
1848 
1849   for (unsigned i = 0; i < 4; i++) {
1850     if (NewBldVec[i].isUndef())
1851       // We mask the write here to teach later passes that the ith element of this
1852       // vector is undef. Thus we can use it to reduce 128-bit register usage,
1853       // break false dependencies, and additionally make the assembly easier to read.
1854       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1855     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1856       if (C->isZero()) {
1857         RemapSwizzle[i] = 4; // SEL_0
1858         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1859       } else if (C->isExactlyValue(1.0)) {
1860         RemapSwizzle[i] = 5; // SEL_1
1861         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1862       }
1863     }
1864 
1865     if (NewBldVec[i].isUndef())
1866       continue;
1867     for (unsigned j = 0; j < i; j++) {
1868       if (NewBldVec[i] == NewBldVec[j]) {
1869         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1870         RemapSwizzle[i] = j;
1871         break;
1872       }
1873     }
1874   }
1875 
1876   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1877                             NewBldVec);
1878 }
1879 
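// Try to move extract_vector_elt lanes into the channel matching their source
// index so the resulting swizzle is closer to the identity; RemapSwizzle
// records any swap that was performed.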
1880 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1881                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1882   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1883   assert(RemapSwizzle.empty());
1884   SDValue NewBldVec[4] = {
1885       VectorEntry.getOperand(0),
1886       VectorEntry.getOperand(1),
1887       VectorEntry.getOperand(2),
1888       VectorEntry.getOperand(3)
1889   };
1890   bool isUnmovable[4] = { false, false, false, false };
1891   for (unsigned i = 0; i < 4; i++) {
1892     RemapSwizzle[i] = i;
1893     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1894       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1895           ->getZExtValue();
1896       if (i == Idx)
1897         isUnmovable[Idx] = true;
1898     }
1899   }
1900 
1901   for (unsigned i = 0; i < 4; i++) {
1902     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1903       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1904           ->getZExtValue();
1905       if (isUnmovable[Idx])
1906         continue;
1907       // Swap i and Idx
1908       std::swap(NewBldVec[Idx], NewBldVec[i]);
1909       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1910       break;
1911     }
1912   }
1913 
1914   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1915                             NewBldVec);
1916 }
1917 
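// Optimize the swizzle selects of an export/texture instruction: compact the
// BUILD_VECTOR operand, then reorder it, rewriting the Swz selects after each
// pass using the recorded remapping.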
1918 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1919                                             SelectionDAG &DAG,
1920                                             const SDLoc &DL) const {
1921   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1922   // Old -> New swizzle values
1923   DenseMap<unsigned, unsigned> SwizzleRemap;
1924 
1925   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1926   for (unsigned i = 0; i < 4; i++) {
1927     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1928     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1929       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1930   }
1931 
1932   SwizzleRemap.clear();
1933   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1934   for (unsigned i = 0; i < 4; i++) {
1935     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1936     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1937       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1938   }
1939 
1940   return BuildVector;
1941 }
1942 
1943 
1944 //===----------------------------------------------------------------------===//
1945 // Custom DAG Optimizations
1946 //===----------------------------------------------------------------------===//
1947 
1948 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1949                                               DAGCombinerInfo &DCI) const {
1950   SelectionDAG &DAG = DCI.DAG;
1951 
1952   switch (N->getOpcode()) {
1953   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1954   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1955   case ISD::FP_ROUND: {
1956       SDValue Arg = N->getOperand(0);
1957       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1958         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1959                            Arg.getOperand(0));
1960       }
1961       break;
1962     }
1963 
1964   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1965   // (i32 select_cc f32, f32, -1, 0 cc)
1966   //
1967   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1968   // this to one of the SET*_DX10 instructions.
1969   case ISD::FP_TO_SINT: {
1970     SDValue FNeg = N->getOperand(0);
1971     if (FNeg.getOpcode() != ISD::FNEG) {
1972       return SDValue();
1973     }
1974     SDValue SelectCC = FNeg.getOperand(0);
1975     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1976         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1977         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1978         !isHWTrueValue(SelectCC.getOperand(2)) ||
1979         !isHWFalseValue(SelectCC.getOperand(3))) {
1980       return SDValue();
1981     }
1982 
1983     SDLoc dl(N);
1984     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1985                            SelectCC.getOperand(0), // LHS
1986                            SelectCC.getOperand(1), // RHS
1987                            DAG.getConstant(-1, dl, MVT::i32), // True
1988                            DAG.getConstant(0, dl, MVT::i32),  // False
1989                            SelectCC.getOperand(4)); // CC
1990 
1991     break;
1992   }
1993 
1994   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1995   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1996   case ISD::INSERT_VECTOR_ELT: {
1997     SDValue InVec = N->getOperand(0);
1998     SDValue InVal = N->getOperand(1);
1999     SDValue EltNo = N->getOperand(2);
2000     SDLoc dl(N);
2001 
2002     // If the inserted element is an UNDEF, just use the input vector.
2003     if (InVal.isUndef())
2004       return InVec;
2005 
2006     EVT VT = InVec.getValueType();
2007 
2008     // If we can't generate a legal BUILD_VECTOR, exit
2009     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
2010       return SDValue();
2011 
2012     // Check that we know which element is being inserted
2013     if (!isa<ConstantSDNode>(EltNo))
2014       return SDValue();
2015     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
2016 
2017     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
2018     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
2019     // vector elements.
2020     SmallVector<SDValue, 8> Ops;
2021     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
2022       Ops.append(InVec.getNode()->op_begin(),
2023                  InVec.getNode()->op_end());
2024     } else if (InVec.isUndef()) {
2025       unsigned NElts = VT.getVectorNumElements();
2026       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
2027     } else {
2028       return SDValue();
2029     }
2030 
2031     // Insert the element
2032     if (Elt < Ops.size()) {
2033       // All the operands of BUILD_VECTOR must have the same type;
2034       // we enforce that here.
2035       EVT OpVT = Ops[0].getValueType();
2036       if (InVal.getValueType() != OpVT)
2037         InVal = OpVT.bitsGT(InVal.getValueType()) ?
2038           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
2039           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
2040       Ops[Elt] = InVal;
2041     }
2042 
2043     // Return the new vector
2044     return DAG.getBuildVector(VT, dl, Ops);
2045   }
2046 
2047   // An extract_vector_elt of a build_vector generated by custom lowering
2048   // also needs to be custom combined.
2049   case ISD::EXTRACT_VECTOR_ELT: {
2050     SDValue Arg = N->getOperand(0);
2051     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
2052       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2053         unsigned Element = Const->getZExtValue();
2054         return Arg->getOperand(Element);
2055       }
2056     }
2057     if (Arg.getOpcode() == ISD::BITCAST &&
2058         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
2059       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2060         unsigned Element = Const->getZExtValue();
2061         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2062             Arg->getOperand(0).getOperand(Element));
2063       }
2064     }
2065     break;
2066   }
2067 
2068   case ISD::SELECT_CC: {
2069     // Try common optimizations
2070     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
2071       return Ret;
2072 
2073     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2074     //      selectcc x, y, a, b, inv(cc)
2075     //
2076     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2077     //      selectcc x, y, a, b, cc
2078     SDValue LHS = N->getOperand(0);
2079     if (LHS.getOpcode() != ISD::SELECT_CC) {
2080       return SDValue();
2081     }
2082 
2083     SDValue RHS = N->getOperand(1);
2084     SDValue True = N->getOperand(2);
2085     SDValue False = N->getOperand(3);
2086     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2087 
2088     if (LHS.getOperand(2).getNode() != True.getNode() ||
2089         LHS.getOperand(3).getNode() != False.getNode() ||
2090         RHS.getNode() != False.getNode()) {
2091       return SDValue();
2092     }
2093 
2094     switch (NCC) {
2095     default: return SDValue();
2096     case ISD::SETNE: return LHS;
2097     case ISD::SETEQ: {
2098       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2099       LHSCC = ISD::getSetCCInverse(LHSCC,
2100                                   LHS.getOperand(0).getValueType().isInteger());
2101       if (DCI.isBeforeLegalizeOps() ||
2102           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2103         return DAG.getSelectCC(SDLoc(N),
2104                                LHS.getOperand(0),
2105                                LHS.getOperand(1),
2106                                LHS.getOperand(2),
2107                                LHS.getOperand(3),
2108                                LHSCC);
2109       break;
2110     }
2111     }
2112     return SDValue();
2113   }
2114 
2115   case AMDGPUISD::EXPORT: {
2116     SDValue Arg = N->getOperand(1);
2117     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2118       break;
2119 
2120     SDValue NewArgs[8] = {
2121       N->getOperand(0), // Chain
2122       SDValue(),
2123       N->getOperand(2), // ArrayBase
2124       N->getOperand(3), // Type
2125       N->getOperand(4), // SWZ_X
2126       N->getOperand(5), // SWZ_Y
2127       N->getOperand(6), // SWZ_Z
2128       N->getOperand(7) // SWZ_W
2129     };
2130     SDLoc DL(N);
2131     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2132     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2133   }
2134   case AMDGPUISD::TEXTURE_FETCH: {
2135     SDValue Arg = N->getOperand(1);
2136     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2137       break;
2138 
2139     SDValue NewArgs[19] = {
2140       N->getOperand(0),
2141       N->getOperand(1),
2142       N->getOperand(2),
2143       N->getOperand(3),
2144       N->getOperand(4),
2145       N->getOperand(5),
2146       N->getOperand(6),
2147       N->getOperand(7),
2148       N->getOperand(8),
2149       N->getOperand(9),
2150       N->getOperand(10),
2151       N->getOperand(11),
2152       N->getOperand(12),
2153       N->getOperand(13),
2154       N->getOperand(14),
2155       N->getOperand(15),
2156       N->getOperand(16),
2157       N->getOperand(17),
2158       N->getOperand(18),
2159     };
2160     SDLoc DL(N);
2161     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2162     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2163   }
2164   }
2165 
2166   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2167 }
2168 
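// Try to fold the source operand at SrcIdx of ParentNode into an instruction
// modifier: FNEG/FABS become the neg/abs bits, CONST_COPY becomes an ALU_CONST
// register plus a sel value (subject to the constant-read limitations), and
// MOV_IMM_* becomes an inline constant register or the ALU_LITERAL_X slot.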
2169 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
2170                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
2171                                      SDValue &Sel, SDValue &Imm,
2172                                      SelectionDAG &DAG) const {
2173   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2174   if (!Src.isMachineOpcode())
2175     return false;
2176 
2177   switch (Src.getMachineOpcode()) {
2178   case AMDGPU::FNEG_R600:
2179     if (!Neg.getNode())
2180       return false;
2181     Src = Src.getOperand(0);
2182     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2183     return true;
2184   case AMDGPU::FABS_R600:
2185     if (!Abs.getNode())
2186       return false;
2187     Src = Src.getOperand(0);
2188     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2189     return true;
2190   case AMDGPU::CONST_COPY: {
2191     unsigned Opcode = ParentNode->getMachineOpcode();
2192     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2193 
2194     if (!Sel.getNode())
2195       return false;
2196 
2197     SDValue CstOffset = Src.getOperand(0);
2198     if (ParentNode->getValueType(0).isVector())
2199       return false;
2200 
2201     // Gather constant values
2202     int SrcIndices[] = {
2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2214     };
2215     std::vector<unsigned> Consts;
2216     for (int OtherSrcIdx : SrcIndices) {
2217       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2218       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2219         continue;
2220       if (HasDst) {
2221         OtherSrcIdx--;
2222         OtherSelIdx--;
2223       }
2224       if (RegisterSDNode *Reg =
2225           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2226         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2227           ConstantSDNode *Cst
2228             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2229           Consts.push_back(Cst->getZExtValue());
2230         }
2231       }
2232     }
2233 
2234     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2235     Consts.push_back(Cst->getZExtValue());
2236     if (!TII->fitsConstReadLimitations(Consts)) {
2237       return false;
2238     }
2239 
2240     Sel = CstOffset;
2241     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2242     return true;
2243   }
2244   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
2245     // Check if the Imm slot is used. Taken from below.
2246     if (cast<ConstantSDNode>(Imm)->getZExtValue())
2247       return false;
2248     Imm = Src.getOperand(0);
2249     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
2250     return true;
2251   case AMDGPU::MOV_IMM_I32:
2252   case AMDGPU::MOV_IMM_F32: {
2253     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2254     uint64_t ImmValue = 0;
2255 
2256 
2257     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2258       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
2259       float FloatValue = FPC->getValueAPF().convertToFloat();
2260       if (FloatValue == 0.0) {
2261         ImmReg = AMDGPU::ZERO;
2262       } else if (FloatValue == 0.5) {
2263         ImmReg = AMDGPU::HALF;
2264       } else if (FloatValue == 1.0) {
2265         ImmReg = AMDGPU::ONE;
2266       } else {
2267         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2268       }
2269     } else {
2270       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
2271       uint64_t Value = C->getZExtValue();
2272       if (Value == 0) {
2273         ImmReg = AMDGPU::ZERO;
2274       } else if (Value == 1) {
2275         ImmReg = AMDGPU::ONE_INT;
2276       } else {
2277         ImmValue = Value;
2278       }
2279     }
2280 
2281     // Check that we aren't already using an immediate.
2282     // XXX: It's possible for an instruction to have more than one
2283     // immediate operand, but this is not supported yet.
2284     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2285       if (!Imm.getNode())
2286         return false;
2287       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2288       assert(C);
2289       if (C->getZExtValue())
2290         return false;
2291       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2292     }
2293     Src = DAG.getRegister(ImmReg, MVT::i32);
2294     return true;
2295   }
2296   default:
2297     return false;
2298   }
2299 }
2300 
2301 /// \brief Fold the instructions after selecting them
2302 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2303                                             SelectionDAG &DAG) const {
2304   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2305   if (!Node->isMachineOpcode())
2306     return Node;
2307 
2308   unsigned Opcode = Node->getMachineOpcode();
2309   SDValue FakeOp;
2310 
2311   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2312 
2313   if (Opcode == AMDGPU::DOT_4) {
2314     int OperandIdx[] = {
2315       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2316       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2317       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2318       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2319       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2320       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2321       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2322       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2323     };
2324     int NegIdx[] = {
2325       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2326       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2327       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2328       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2329       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2330       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2331       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2332       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2333     };
2334     int AbsIdx[] = {
2335       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2336       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2337       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2338       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2339       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2340       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2341       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2342       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2343     };
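    // Each of the eight DOT_4 sources has its own neg/abs/sel operands; try to
    // fold each source in turn and rebuild the node on the first success.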
2344     for (unsigned i = 0; i < 8; i++) {
2345       if (OperandIdx[i] < 0)
2346         return Node;
2347       SDValue &Src = Ops[OperandIdx[i] - 1];
2348       SDValue &Neg = Ops[NegIdx[i] - 1];
2349       SDValue &Abs = Ops[AbsIdx[i] - 1];
2350       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2351       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2352       if (HasDst)
2353         SelIdx--;
2354       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2355       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2356         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2357     }
2358   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2359     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2360       SDValue &Src = Ops[i];
2361       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2362         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2363     }
2364   } else if (Opcode == AMDGPU::CLAMP_R600) {
2365     SDValue Src = Node->getOperand(0);
2366     if (!Src.isMachineOpcode() ||
2367         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2368       return Node;
2369     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2370         AMDGPU::OpName::clamp);
2371     if (ClampIdx < 0)
2372       return Node;
2373     SDLoc DL(Node);
2374     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2375     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2376     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2377                               Node->getVTList(), Ops);
2378   } else {
2379     if (!TII->hasInstrModifiers(Opcode))
2380       return Node;
2381     int OperandIdx[] = {
2382       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2383       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2384       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2385     };
2386     int NegIdx[] = {
2387       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2388       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2389       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2390     };
2391     int AbsIdx[] = {
2392       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2393       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2394       -1
2395     };
2396     for (unsigned i = 0; i < 3; i++) {
2397       if (OperandIdx[i] < 0)
2398         return Node;
2399       SDValue &Src = Ops[OperandIdx[i] - 1];
2400       SDValue &Neg = Ops[NegIdx[i] - 1];
2401       SDValue FakeAbs;
2402       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2403       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2404       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2405       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2406       if (HasDst) {
2407         SelIdx--;
2408         ImmIdx--;
2409       }
2410       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2411       SDValue &Imm = Ops[ImmIdx];
2412       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2413         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2414     }
2415   }
2416 
2417   return Node;
2418 }
2419