//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Most of the DAG lowering is handled in AMDGPUISelLowering.cpp.  This file
// mostly implements EmitInstrWithCustomInserter() and a handful of SI-specific
// custom-lowered operations.
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDIL.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo()))
{
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass);
  addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::AND, MVT::i1, Custom);

  setOperationAction(ISD::ADD, MVT::i64, Legal);
  setOperationAction(ISD::ADD, MVT::i32, Legal);

  setOperationAction(ISD::BR_CC, MVT::i32, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // We need to custom lower loads from the USER_SGPR address space, so we can
  // add the SGPRs as livein registers.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  setTargetDAGCombine(ISD::SELECT_CC);

  setTargetDAGCombine(ISD::SETCC);
}

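/// EmitInstrWithCustomInserter - Expand pseudo instructions that tablegen
/// cannot handle directly: CLAMP/FABS/FNEG become V_MOV_B32_e64 with the
/// appropriate modifier bits set, and the SI_* pseudos are lowered by the
/// helpers below.  Instructions flagged NEED_WAIT get a trailing S_WAITCNT.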
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const
{
  const TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
  MachineBasicBlock::iterator I = MI;

  if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) {
    AppendS_WAITCNT(MI, *BB, llvm::next(I));
    return BB;
  }

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);

  case AMDGPU::CLAMP_SI:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
           .addOperand(MI->getOperand(0))
           .addOperand(MI->getOperand(1))
           // VSRC1-2 are unused, but we still need to fill all the
           // operand slots, so we just reuse the VSRC0 operand
           .addOperand(MI->getOperand(1))
           .addOperand(MI->getOperand(1))
           .addImm(0) // ABS
           .addImm(1) // CLAMP
           .addImm(0) // OMOD
           .addImm(0); // NEG
    MI->eraseFromParent();
    break;

  case AMDGPU::FABS_SI:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
           .addOperand(MI->getOperand(0))
           .addOperand(MI->getOperand(1))
           // VSRC1-2 are unused, but we still need to fill all the
           // operand slots, so we just reuse the VSRC0 operand
           .addOperand(MI->getOperand(1))
           .addOperand(MI->getOperand(1))
           .addImm(1) // ABS
           .addImm(0) // CLAMP
           .addImm(0) // OMOD
           .addImm(0); // NEG
    MI->eraseFromParent();
    break;

  case AMDGPU::FNEG_SI:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
           .addOperand(MI->getOperand(0))
           .addOperand(MI->getOperand(1))
           // VSRC1-2 are unused, but we still need to fill all the
           // operand slots, so we just reuse the VSRC0 operand
           .addOperand(MI->getOperand(1))
           .addOperand(MI->getOperand(1))
           .addImm(0) // ABS
           .addImm(0) // CLAMP
           .addImm(0) // OMOD
           .addImm(1); // NEG
    MI->eraseFromParent();
    break;

  case AMDGPU::SI_INTERP:
    LowerSI_INTERP(MI, *BB, I, MRI);
    break;
  case AMDGPU::SI_INTERP_CONST:
    LowerSI_INTERP_CONST(MI, *BB, I, MRI);
    break;
  case AMDGPU::SI_KIL:
    LowerSI_KIL(MI, *BB, I, MRI);
    break;
  case AMDGPU::SI_V_CNDLT:
    LowerSI_V_CNDLT(MI, *BB, I, MRI);
    break;
  }
  return BB;
}

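/// AppendS_WAITCNT - Insert an S_WAITCNT instruction with an immediate of 0
/// at \p I, used to wait for the preceding instruction's outstanding
/// operations to complete.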
void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I) const
{
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT))
          .addImm(0);
}

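/// LowerSI_INTERP - Expand the SI_INTERP pseudo into the two-stage
/// V_INTERP_P1_F32 / V_INTERP_P2_F32 sequence, moving the params operand
/// into M0 with S_MOV_B32 first.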
void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
{
  unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
  MachineOperand dst = MI->getOperand(0);
  MachineOperand iReg = MI->getOperand(1);
  MachineOperand jReg = MI->getOperand(2);
  MachineOperand attr_chan = MI->getOperand(3);
  MachineOperand attr = MI->getOperand(4);
  MachineOperand params = MI->getOperand(5);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
          .addOperand(params);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
          .addOperand(iReg)
          .addOperand(attr_chan)
          .addOperand(attr)
          .addReg(M0);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
          .addOperand(dst)
          .addReg(tmp)
          .addOperand(jReg)
          .addOperand(attr_chan)
          .addOperand(attr)
          .addReg(M0);

  MI->eraseFromParent();
}

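/// LowerSI_INTERP_CONST - Expand the SI_INTERP_CONST pseudo into a single
/// V_INTERP_MOV_F32, moving the params operand into M0 with S_MOV_B32 first.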
void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI,
    MachineBasicBlock &BB, MachineBasicBlock::iterator I,
    MachineRegisterInfo &MRI) const
{
  MachineOperand dst = MI->getOperand(0);
  MachineOperand attr_chan = MI->getOperand(1);
  MachineOperand attr = MI->getOperand(2);
  MachineOperand params = MI->getOperand(3);
  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
          .addOperand(params);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32))
          .addOperand(dst)
          .addOperand(attr_chan)
          .addOperand(attr)
          .addReg(M0);

  MI->eraseFromParent();
}

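/// LowerSI_KIL - Expand the SI_KIL pseudo: clear the pixel's bit in the exec
/// mask when the operand is negative, and if the whole exec mask becomes
/// zero, export to the NULL target and terminate the wavefront.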
void SITargetLowering::LowerSI_KIL(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
{
  // Clear this pixel from the exec mask if the operand is negative
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CMPX_LE_F32_e32),
          AMDGPU::VCC)
          .addReg(AMDGPU::SREG_LIT_0)
          .addOperand(MI->getOperand(0));

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .addImm(3)
          .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::SREG_LIT_0)
          .addReg(AMDGPU::SREG_LIT_0)
          .addReg(AMDGPU::SREG_LIT_0)
          .addReg(AMDGPU::SREG_LIT_0);

  // ... and terminate wavefront
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));

  MI->eraseFromParent();
}

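/// LowerSI_V_CNDLT - Expand the SI_V_CNDLT pseudo into a V_CMP_LT_F32
/// against zero that sets VCC, followed by a V_CNDMASK_B32 that selects
/// between the two source operands based on VCC.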
void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
{
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CMP_LT_F32_e32),
          AMDGPU::VCC)
          .addOperand(MI->getOperand(1))
          .addReg(AMDGPU::SREG_LIT_0);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32))
          .addOperand(MI->getOperand(0))
          .addReg(AMDGPU::VCC)
          .addOperand(MI->getOperand(2))
          .addOperand(MI->getOperand(3));

  MI->eraseFromParent();
}

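/// getSetCCResultType - SETCC on SI always produces an i1 result.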
EVT SITargetLowering::getSetCCResultType(EVT VT) const
{
  return MVT::i1;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

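/// LowerOperation - Dispatch the operations marked Custom in the constructor
/// to the SI-specific lowering routines; everything else falls back to
/// AMDGPUTargetLowering.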
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::SI_vs_load_buffer_index:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    }
    break;
  }
  }
  return SDValue();
}

/// Loweri1ContextSwitch - Lower i1 operations on the VCC register.  In the
/// VALU context, VCC is a one-bit register, but in the SALU context it is a
/// 64-bit register (one bit per thread).  Since only the SALU can perform
/// operations on the VCC register, we need to promote the operand types from
/// i1 to i64 so that tablegen can match this operation to the correct SALU
/// instruction.  We do this promotion by wrapping the operands in
/// SIISD::VCC_BITCAST nodes.
///
SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op,
                                               SelectionDAG &DAG,
                                               unsigned VCCNode) const
{
  DebugLoc DL = Op.getDebugLoc();

  SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64,
                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
                                           Op.getOperand(0)),
                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
                                           Op.getOperand(1)));

  return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode);
}

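/// LowerBR_CC - Lower BR_CC into a SETCC that computes the i1 condition,
/// feeding an AMDGPUISD::BRANCH_COND node.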
SDValue SITargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const
{
  SDValue Chain = Op.getOperand(0);
  SDValue CC = Op.getOperand(1);
  SDValue LHS   = Op.getOperand(2);
  SDValue RHS   = Op.getOperand(3);
  SDValue JumpT  = Op.getOperand(4);
  SDValue CmpValue;
  SDValue Result;
  CmpValue = DAG.getNode(
      ISD::SETCC,
      Op.getDebugLoc(),
      MVT::i1,
      LHS, RHS,
      CC);

  Result = DAG.getNode(
      AMDGPUISD::BRANCH_COND,
      CmpValue.getDebugLoc(),
      MVT::Other, Chain,
      JumpT, CmpValue);
  return Result;
}

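/// LowerLOAD - Lower loads from the USER_SGPR address space by replacing the
/// load with a reference to the SGPR that is live-in at the constant pointer
/// index; all other loads are left untouched.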
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op);

  assert(Ptr);

  unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();

  // We only need to lower USER_SGPR address space loads
  if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
    return SDValue();
  }

  // Loads from the USER_SGPR address space can only have constant value
  // pointers.
  ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr());
  assert(BasePtr);

  unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
  const TargetRegisterClass * dstClass;
  switch (TypeDwordWidth) {
    default:
      assert(!"USER_SGPR value size not implemented");
      return SDValue();
    case 1:
      dstClass = &AMDGPU::SReg_32RegClass;
      break;
    case 2:
      dstClass = &AMDGPU::SReg_64RegClass;
      break;
  }
  uint64_t Index = BasePtr->getZExtValue();
  assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
  unsigned SGPRIndex = Index / TypeDwordWidth;
  unsigned Reg = dstClass->getRegister(SGPRIndex);

  DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
                                                         VT));
  return SDValue();
}

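/// LowerSELECT_CC - Lower SELECT_CC into a SETCC that produces the i1
/// condition followed by a plain SELECT.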
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();

  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

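/// PerformDAGCombine - Target-specific combines: fold an i1 selectcc with
/// all-ones/zero operands into a setcc, and drop a sign-extend feeding an i1
/// setcc-against-zero.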
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  DebugLoc DL = N->getDebugLoc();
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
    default: break;
    case ISD::SELECT_CC: {
      ConstantSDNode *True, *False;
      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
          && True->isAllOnesValue()
          && False->isNullValue()
          && VT == MVT::i1) {
        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                           N->getOperand(1), N->getOperand(4));

      }
      break;
    }
    case ISD::SETCC: {
      SDValue Arg0 = N->getOperand(0);
      SDValue Arg1 = N->getOperand(1);
      SDValue CC = N->getOperand(2);
      ConstantSDNode * C = NULL;
      ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();

      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
      if (VT == MVT::i1
          && Arg0.getOpcode() == ISD::SIGN_EXTEND
          && Arg0.getOperand(0).getValueType() == MVT::i1
          && (C = dyn_cast<ConstantSDNode>(Arg1))
          && C->isNullValue()
          && CCOp == ISD::SETNE) {
        return SimplifySetCC(VT, Arg0.getOperand(0),
                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
      }
      break;
    }
  }
  return SDValue();
}

#define NODE_NAME_CASE(node) case SIISD::node: return #node;

const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const
{
  switch (Opcode) {
  default: return AMDGPUTargetLowering::getTargetNodeName(Opcode);
  NODE_NAME_CASE(VCC_AND)
  NODE_NAME_CASE(VCC_BITCAST)
  }
}