//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Most of the DAG lowering is handled in AMDGPUISelLowering.cpp. This file
// mostly implements EmitInstrWithCustomInserter().
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDIL.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

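// Set up the register classes, legal operations, custom lowering hooks, and
// target DAG combines for the SI target.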
SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo()))
{
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass);
  addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::AND, MVT::i1, Custom);

  setOperationAction(ISD::ADD, MVT::i64, Legal);
  setOperationAction(ISD::ADD, MVT::i32, Legal);

  setOperationAction(ISD::BR_CC, MVT::i32, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // We need to custom lower loads from the USER_SGPR address space, so we can
  // add the SGPRs as livein registers.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  setTargetDAGCombine(ISD::SELECT_CC);

  setTargetDAGCombine(ISD::SETCC);
}

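/// EmitInstrWithCustomInserter - Expand SI pseudo instructions after
/// instruction selection.  Instructions flagged NEED_WAIT get a trailing
/// S_WAITCNT; CLAMP_SI/FABS_SI/FNEG_SI become V_MOV_B32 with the matching
/// modifier bit set; the remaining pseudos are expanded by the Lower*
/// helpers below.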
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const
{
  const TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
  MachineBasicBlock::iterator I = MI;

  if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) {
    AppendS_WAITCNT(MI, *BB, llvm::next(I));
    return BB;
  }

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);

  case AMDGPU::CLAMP_SI:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
           .addOperand(MI->getOperand(0))
           .addOperand(MI->getOperand(1))
           // VSRC1-2 are unused, but we still need to fill all the
           // operand slots, so we just reuse the VSRC0 operand
           .addOperand(MI->getOperand(1))
           .addOperand(MI->getOperand(1))
           .addImm(0) // ABS
           .addImm(1) // CLAMP
           .addImm(0) // OMOD
           .addImm(0); // NEG
    MI->eraseFromParent();
    break;

  case AMDGPU::FABS_SI:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
           .addOperand(MI->getOperand(0))
           .addOperand(MI->getOperand(1))
           // VSRC1-2 are unused, but we still need to fill all the
           // operand slots, so we just reuse the VSRC0 operand
           .addOperand(MI->getOperand(1))
           .addOperand(MI->getOperand(1))
           .addImm(1) // ABS
           .addImm(0) // CLAMP
           .addImm(0) // OMOD
           .addImm(0); // NEG
    MI->eraseFromParent();
    break;

  case AMDGPU::FNEG_SI:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
           .addOperand(MI->getOperand(0))
           .addOperand(MI->getOperand(1))
           // VSRC1-2 are unused, but we still need to fill all the
           // operand slots, so we just reuse the VSRC0 operand
           .addOperand(MI->getOperand(1))
           .addOperand(MI->getOperand(1))
           .addImm(0) // ABS
           .addImm(0) // CLAMP
           .addImm(0) // OMOD
           .addImm(1); // NEG
    MI->eraseFromParent();
    break;

  case AMDGPU::SI_INTERP:
    LowerSI_INTERP(MI, *BB, I, MRI);
    break;
  case AMDGPU::SI_INTERP_CONST:
    LowerSI_INTERP_CONST(MI, *BB, I, MRI);
    break;
  case AMDGPU::SI_KIL:
    LowerSI_KIL(MI, *BB, I, MRI);
    break;
  case AMDGPU::SI_V_CNDLT:
    LowerSI_V_CNDLT(MI, *BB, I, MRI);
    break;
  }
  return BB;
}

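/// AppendS_WAITCNT - Insert an S_WAITCNT 0 after MI, so that all of MI's
/// outstanding memory operations complete before execution continues.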
void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I) const
{
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT))
          .addImm(0);
}

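/// LowerSI_INTERP - Expand the SI_INTERP pseudo.  The parameter pointer is
/// copied into M0, then V_INTERP_P1_F32 and V_INTERP_P2_F32 interpolate the
/// attribute using the barycentric coordinates in iReg and jReg.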
void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
{
  unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
  MachineOperand dst = MI->getOperand(0);
  MachineOperand iReg = MI->getOperand(1);
  MachineOperand jReg = MI->getOperand(2);
  MachineOperand attr_chan = MI->getOperand(3);
  MachineOperand attr = MI->getOperand(4);
  MachineOperand params = MI->getOperand(5);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
          .addOperand(params);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
          .addOperand(iReg)
          .addOperand(attr_chan)
          .addOperand(attr)
          .addReg(M0);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
          .addOperand(dst)
          .addReg(tmp)
          .addOperand(jReg)
          .addOperand(attr_chan)
          .addOperand(attr)
          .addReg(M0);

  MI->eraseFromParent();
}

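/// LowerSI_INTERP_CONST - Expand the SI_INTERP_CONST pseudo.  The parameter
/// pointer is copied into M0 and V_INTERP_MOV_F32 reads the attribute
/// directly, without interpolation.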
void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI,
    MachineBasicBlock &BB, MachineBasicBlock::iterator I,
    MachineRegisterInfo &MRI) const
{
  MachineOperand dst = MI->getOperand(0);
  MachineOperand attr_chan = MI->getOperand(1);
  MachineOperand attr = MI->getOperand(2);
  MachineOperand params = MI->getOperand(3);
  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
          .addOperand(params);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32))
          .addOperand(dst)
          .addOperand(attr_chan)
          .addOperand(attr)
          .addReg(M0);

  MI->eraseFromParent();
}

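/// LowerSI_KIL - Expand the SI_KIL pseudo.  Pixels whose operand is negative
/// are cleared from the EXEC mask, and if the whole mask becomes zero the
/// wavefront performs a null export and terminates.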
void SITargetLowering::LowerSI_KIL(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
{
  // Clear this pixel from the exec mask if the operand is negative
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CMPX_LE_F32_e32),
          AMDGPU::VCC)
          .addReg(AMDGPU::SREG_LIT_0)
          .addOperand(MI->getOperand(0));

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .addImm(3)
          .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::SREG_LIT_0)
          .addReg(AMDGPU::SREG_LIT_0)
          .addReg(AMDGPU::SREG_LIT_0)
          .addReg(AMDGPU::SREG_LIT_0);

  // ... and terminate wavefront
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));

  MI->eraseFromParent();
}

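/// LowerSI_V_CNDLT - Expand the SI_V_CNDLT pseudo: compare the first source
/// operand against 0.0 and use V_CNDMASK_B32 to select one of the other two
/// operands based on the per-lane VCC result.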
void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
{
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CMP_LT_F32_e32),
          AMDGPU::VCC)
          .addOperand(MI->getOperand(1))
          .addReg(AMDGPU::SREG_LIT_0);

  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32))
          .addOperand(MI->getOperand(0))
          .addReg(AMDGPU::VCC)
          .addOperand(MI->getOperand(2))
          .addOperand(MI->getOperand(3));

  MI->eraseFromParent();
}

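/// getSetCCResultType - SETCC on SI always produces an i1 condition value.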
EVT SITargetLowering::getSetCCResultType(EVT VT) const
{
  return MVT::i1;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                    cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::SI_vs_load_buffer_index:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    }
    break;
  }
  }
  return SDValue();
}

/// Loweri1ContextSwitch - Lower i1 operations on the VCC register. In the
/// VALU context, VCC is a one-bit register, but in the SALU context VCC is
/// a 64-bit register (1 bit per thread). Since only the SALU can perform
/// operations on the VCC register, we need to promote the operand types
/// from i1 to i64 in order for tablegen to be able to match this operation
/// to the correct SALU instruction. We do this promotion by wrapping the
/// operands in VCC_BITCAST nodes.
///
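/// For example, (and i1:$lhs, i1:$rhs) becomes:
///   (VCC_BITCAST:i1 (and:i64 (VCC_BITCAST:i64 $lhs), (VCC_BITCAST:i64 $rhs)))
///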
SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op,
                                               SelectionDAG &DAG,
                                               unsigned VCCNode) const
{
  DebugLoc DL = Op.getDebugLoc();

  SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64,
                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
                                           Op.getOperand(0)),
                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
                                           Op.getOperand(1)));

  return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode);
}

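/// LowerBR_CC - Lower BR_CC to an explicit SETCC that computes the i1
/// condition, followed by an AMDGPUISD::BRANCH_COND on that value.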
SDValue SITargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const
{
  SDValue Chain = Op.getOperand(0);
  SDValue CC = Op.getOperand(1);
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue JumpT = Op.getOperand(4);

  SDValue CmpValue = DAG.getNode(ISD::SETCC, Op.getDebugLoc(), MVT::i1,
                                 LHS, RHS, CC);

  SDValue Result = DAG.getNode(AMDGPUISD::BRANCH_COND, CmpValue.getDebugLoc(),
                               MVT::Other, Chain, JumpT, CmpValue);
  return Result;
}

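/// LowerLOAD - Custom lower loads from the USER_SGPR address space.  The
/// constant base pointer is interpreted as an SGPR index, and the load is
/// replaced by a copy from the corresponding live-in scalar register.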
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op);

  assert(Ptr);

  unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();

  // We only need to lower USER_SGPR address space loads
  if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
    return SDValue();
  }

  // Loads from the USER_SGPR address space can only have constant value
  // pointers.
  ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr());
  assert(BasePtr);

  unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
  const TargetRegisterClass * dstClass;
  switch (TypeDwordWidth) {
  default:
    assert(!"USER_SGPR value size not implemented");
    return SDValue();
  case 1:
    dstClass = &AMDGPU::SReg_32RegClass;
    break;
  case 2:
    dstClass = &AMDGPU::SReg_64RegClass;
    break;
  }
  uint64_t Index = BasePtr->getZExtValue();
  assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
  unsigned SGPRIndex = Index / TypeDwordWidth;
  unsigned Reg = dstClass->getRegister(SGPRIndex);

  DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
                                                         VT));
  return SDValue();
}

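/// LowerSELECT_CC - Split SELECT_CC into a SETCC computing the i1 condition
/// and a SELECT choosing between the two values.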
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();

  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

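/// PerformDAGCombine - Apply SI-specific combines:
///   i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
///   i1 setcc(sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)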
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  DebugLoc DL = N->getDebugLoc();
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
    default: break;
    case ISD::SELECT_CC: {
      ConstantSDNode *True, *False;
      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
          && True->isAllOnesValue()
          && False->isNullValue()
          && VT == MVT::i1) {
        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                           N->getOperand(1), N->getOperand(4));
      }
      break;
    }
    case ISD::SETCC: {
      SDValue Arg0 = N->getOperand(0);
      SDValue Arg1 = N->getOperand(1);
      SDValue CC = N->getOperand(2);
      ConstantSDNode * C = NULL;
      // Use cast<> rather than dyn_cast<>: the result is dereferenced
      // unconditionally, so a non-CondCodeSDNode operand would be a bug.
      ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();

      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
      if (VT == MVT::i1
          && Arg0.getOpcode() == ISD::SIGN_EXTEND
          && Arg0.getOperand(0).getValueType() == MVT::i1
          && (C = dyn_cast<ConstantSDNode>(Arg1))
          && C->isNullValue()
          && CCOp == ISD::SETNE) {
        return SimplifySetCC(VT, Arg0.getOperand(0),
                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
      }
      break;
    }
  }
  return SDValue();
}

#define NODE_NAME_CASE(node) case SIISD::node: return #node;

const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const
{
  switch (Opcode) {
  default: return AMDGPUTargetLowering::getTargetNodeName(Opcode);
  NODE_NAME_CASE(VCC_AND)
  NODE_NAME_CASE(VCC_BITCAST)
  }
}