1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for R600
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "R600ISelLowering.h"
16 #include "AMDGPUFrameLowering.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "R600Defines.h"
20 #include "R600InstrInfo.h"
21 #include "R600MachineFunctionInfo.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/SelectionDAG.h"
28 #include "llvm/IR/Argument.h"
29 #include "llvm/IR/Function.h"
30
31 using namespace llvm;
32
33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
34 const AMDGPUSubtarget &STI)
35 : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
37 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
38 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
39 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
40 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
41 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
42
43 computeRegisterProperties(STI.getRegisterInfo());
44
45 // Set condition code actions
46 setCondCodeAction(ISD::SETO, MVT::f32, Expand);
47 setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
48 setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
49 setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
50 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
51 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
52 setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
53 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
54 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
55 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
56 setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
57 setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
58
59 setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
60 setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
61 setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
62 setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
63
64 setOperationAction(ISD::FCOS, MVT::f32, Custom);
65 setOperationAction(ISD::FSIN, MVT::f32, Custom);
66
67 setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
68 setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
69
70 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
71 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
72 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
73
74 setOperationAction(ISD::FSUB, MVT::f32, Expand);
75
76 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
77 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
78 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
79
80 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
81 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
82
83 setOperationAction(ISD::SETCC, MVT::i32, Expand);
84 setOperationAction(ISD::SETCC, MVT::f32, Expand);
85 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
86 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
87 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
88
89 setOperationAction(ISD::SELECT, MVT::i32, Expand);
90 setOperationAction(ISD::SELECT, MVT::f32, Expand);
91 setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
92 setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
93
94 // ADD, SUB overflow.
95 // TODO: turn these into Legal?
96 if (Subtarget->hasCARRY())
97 setOperationAction(ISD::UADDO, MVT::i32, Custom);
98
99 if (Subtarget->hasBORROW())
100 setOperationAction(ISD::USUBO, MVT::i32, Custom);
101
102 // Expand sign extension of vectors
103 if (!Subtarget->hasBFE())
104 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
105
106 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
107 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
108
109 if (!Subtarget->hasBFE())
110 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
111 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
113
114 if (!Subtarget->hasBFE())
115 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
118
119 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
121 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
122
123 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
124
125
126 // Legalize loads and stores to the private address space.
127 setOperationAction(ISD::LOAD, MVT::i32, Custom);
128 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
129 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
130
131 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
132 // spaces, so it is custom lowered to handle those where it isn't.
133 for (MVT VT : MVT::integer_valuetypes()) {
134 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
135 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
137
138 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
139 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
141
142 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
143 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
145 }
146
147 setOperationAction(ISD::STORE, MVT::i8, Custom);
148 setOperationAction(ISD::STORE, MVT::i32, Custom);
149 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
150 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
151 setTruncStoreAction(MVT::i32, MVT::i8, Custom);
152 setTruncStoreAction(MVT::i32, MVT::i16, Custom);
153
154 setOperationAction(ISD::LOAD, MVT::i32, Custom);
155 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
156 setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
157
158 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
159 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
160 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
161 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
162
163 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
164 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
165 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
166 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
167
168 setTargetDAGCombine(ISD::FP_ROUND);
169 setTargetDAGCombine(ISD::FP_TO_SINT);
170 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
171 setTargetDAGCombine(ISD::SELECT_CC);
172 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
173
174 // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
175 // to be Legal/Custom in order to avoid library calls.
176 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
177 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
178 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
179
180 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
181
182 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
183 for (MVT VT : ScalarIntVTs) {
184 setOperationAction(ISD::ADDC, VT, Expand);
185 setOperationAction(ISD::SUBC, VT, Expand);
186 setOperationAction(ISD::ADDE, VT, Expand);
187 setOperationAction(ISD::SUBE, VT, Expand);
188 }
189
190 setSchedulingPreference(Sched::Source);
191 }
192
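// Returns true when the instruction following I is the RETURN that ends the
// program; used below to decide whether exports and RAT writes set their
// end-of-program bit.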
193 static inline bool isEOP(MachineBasicBlock::iterator I) {
194 return std::next(I)->getOpcode() == AMDGPU::RETURN;
195 }
196
197 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
198 MachineInstr * MI, MachineBasicBlock * BB) const {
199 MachineFunction * MF = BB->getParent();
200 MachineRegisterInfo &MRI = MF->getRegInfo();
201 MachineBasicBlock::iterator I = *MI;
202 const R600InstrInfo *TII =
203 static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
204
205 switch (MI->getOpcode()) {
206 default:
207     // Replace LDS_*_RET instructions that don't have any uses with the
208     // equivalent LDS_*_NORET instructions.
209 if (TII->isLDSRetInstr(MI->getOpcode())) {
210 int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
211 assert(DstIdx != -1);
212 MachineInstrBuilder NewMI;
213 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
214 // LDS_1A2D support and remove this special case.
215 if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
216 MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
217 return BB;
218
219 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
220 TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
221 for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
222 NewMI.addOperand(MI->getOperand(i));
223 }
224 } else {
225 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
226 }
227 break;
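  // The CLAMP, FABS and FNEG pseudo-instructions below all lower to a plain
  // MOV carrying the corresponding R600 instruction flag.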
228 case AMDGPU::CLAMP_R600: {
229 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
230 AMDGPU::MOV,
231 MI->getOperand(0).getReg(),
232 MI->getOperand(1).getReg());
233 TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
234 break;
235 }
236
237 case AMDGPU::FABS_R600: {
238 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
239 AMDGPU::MOV,
240 MI->getOperand(0).getReg(),
241 MI->getOperand(1).getReg());
242 TII->addFlag(NewMI, 0, MO_FLAG_ABS);
243 break;
244 }
245
246 case AMDGPU::FNEG_R600: {
247 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
248 AMDGPU::MOV,
249 MI->getOperand(0).getReg(),
250 MI->getOperand(1).getReg());
251 TII->addFlag(NewMI, 0, MO_FLAG_NEG);
252 break;
253 }
254
255 case AMDGPU::MASK_WRITE: {
256 unsigned maskedRegister = MI->getOperand(0).getReg();
257 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
258 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
259 TII->addFlag(defInstr, 0, MO_FLAG_MASK);
260 break;
261 }
262
263 case AMDGPU::MOV_IMM_F32:
264 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
265 MI->getOperand(1).getFPImm()->getValueAPF()
266 .bitcastToAPInt().getZExtValue());
267 break;
268 case AMDGPU::MOV_IMM_I32:
269 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
270 MI->getOperand(1).getImm());
271 break;
272 case AMDGPU::CONST_COPY: {
273 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
274 MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
275 TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
276 MI->getOperand(1).getImm());
277 break;
278 }
279
280 case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
281 case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
282 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
283 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
284 .addOperand(MI->getOperand(0))
285 .addOperand(MI->getOperand(1))
286 .addImm(isEOP(I)); // Set End of program bit
287 break;
288 }
289 case AMDGPU::RAT_STORE_TYPED_eg: {
290 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
291 .addOperand(MI->getOperand(0))
292 .addOperand(MI->getOperand(1))
293 .addOperand(MI->getOperand(2))
294 .addImm(isEOP(I)); // Set End of program bit
295 break;
296 }
297
298 case AMDGPU::TXD: {
299 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
300 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
301 MachineOperand &RID = MI->getOperand(4);
302 MachineOperand &SID = MI->getOperand(5);
303 unsigned TextureId = MI->getOperand(6).getImm();
304 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
305 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
306
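    // SrcX..SrcW pick the coordinate channels passed to the texture unit and
    // CTX..CTW are the per-channel coordinate-type flags; both are adjusted
    // below for rect, shadow and array texture targets.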
307 switch (TextureId) {
308 case 5: // Rect
309 CTX = CTY = 0;
310 break;
311 case 6: // Shadow1D
312 SrcW = SrcZ;
313 break;
314 case 7: // Shadow2D
315 SrcW = SrcZ;
316 break;
317 case 8: // ShadowRect
318 CTX = CTY = 0;
319 SrcW = SrcZ;
320 break;
321 case 9: // 1DArray
322 SrcZ = SrcY;
323 CTZ = 0;
324 break;
325 case 10: // 2DArray
326 CTZ = 0;
327 break;
328 case 11: // Shadow1DArray
329 SrcZ = SrcY;
330 CTZ = 0;
331 break;
332 case 12: // Shadow2DArray
333 CTZ = 0;
334 break;
335 }
336 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
337 .addOperand(MI->getOperand(3))
338 .addImm(SrcX)
339 .addImm(SrcY)
340 .addImm(SrcZ)
341 .addImm(SrcW)
342 .addImm(0)
343 .addImm(0)
344 .addImm(0)
345 .addImm(0)
346 .addImm(1)
347 .addImm(2)
348 .addImm(3)
349 .addOperand(RID)
350 .addOperand(SID)
351 .addImm(CTX)
352 .addImm(CTY)
353 .addImm(CTZ)
354 .addImm(CTW);
355 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
356 .addOperand(MI->getOperand(2))
357 .addImm(SrcX)
358 .addImm(SrcY)
359 .addImm(SrcZ)
360 .addImm(SrcW)
361 .addImm(0)
362 .addImm(0)
363 .addImm(0)
364 .addImm(0)
365 .addImm(1)
366 .addImm(2)
367 .addImm(3)
368 .addOperand(RID)
369 .addOperand(SID)
370 .addImm(CTX)
371 .addImm(CTY)
372 .addImm(CTZ)
373 .addImm(CTW);
374 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
375 .addOperand(MI->getOperand(0))
376 .addOperand(MI->getOperand(1))
377 .addImm(SrcX)
378 .addImm(SrcY)
379 .addImm(SrcZ)
380 .addImm(SrcW)
381 .addImm(0)
382 .addImm(0)
383 .addImm(0)
384 .addImm(0)
385 .addImm(1)
386 .addImm(2)
387 .addImm(3)
388 .addOperand(RID)
389 .addOperand(SID)
390 .addImm(CTX)
391 .addImm(CTY)
392 .addImm(CTZ)
393 .addImm(CTW)
394 .addReg(T0, RegState::Implicit)
395 .addReg(T1, RegState::Implicit);
396 break;
397 }
398
399 case AMDGPU::TXD_SHADOW: {
400 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
401 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
402 MachineOperand &RID = MI->getOperand(4);
403 MachineOperand &SID = MI->getOperand(5);
404 unsigned TextureId = MI->getOperand(6).getImm();
405 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
406 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
407
408 switch (TextureId) {
409 case 5: // Rect
410 CTX = CTY = 0;
411 break;
412 case 6: // Shadow1D
413 SrcW = SrcZ;
414 break;
415 case 7: // Shadow2D
416 SrcW = SrcZ;
417 break;
418 case 8: // ShadowRect
419 CTX = CTY = 0;
420 SrcW = SrcZ;
421 break;
422 case 9: // 1DArray
423 SrcZ = SrcY;
424 CTZ = 0;
425 break;
426 case 10: // 2DArray
427 CTZ = 0;
428 break;
429 case 11: // Shadow1DArray
430 SrcZ = SrcY;
431 CTZ = 0;
432 break;
433 case 12: // Shadow2DArray
434 CTZ = 0;
435 break;
436 }
437
438 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
439 .addOperand(MI->getOperand(3))
440 .addImm(SrcX)
441 .addImm(SrcY)
442 .addImm(SrcZ)
443 .addImm(SrcW)
444 .addImm(0)
445 .addImm(0)
446 .addImm(0)
447 .addImm(0)
448 .addImm(1)
449 .addImm(2)
450 .addImm(3)
451 .addOperand(RID)
452 .addOperand(SID)
453 .addImm(CTX)
454 .addImm(CTY)
455 .addImm(CTZ)
456 .addImm(CTW);
457 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
458 .addOperand(MI->getOperand(2))
459 .addImm(SrcX)
460 .addImm(SrcY)
461 .addImm(SrcZ)
462 .addImm(SrcW)
463 .addImm(0)
464 .addImm(0)
465 .addImm(0)
466 .addImm(0)
467 .addImm(1)
468 .addImm(2)
469 .addImm(3)
470 .addOperand(RID)
471 .addOperand(SID)
472 .addImm(CTX)
473 .addImm(CTY)
474 .addImm(CTZ)
475 .addImm(CTW);
476 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
477 .addOperand(MI->getOperand(0))
478 .addOperand(MI->getOperand(1))
479 .addImm(SrcX)
480 .addImm(SrcY)
481 .addImm(SrcZ)
482 .addImm(SrcW)
483 .addImm(0)
484 .addImm(0)
485 .addImm(0)
486 .addImm(0)
487 .addImm(1)
488 .addImm(2)
489 .addImm(3)
490 .addOperand(RID)
491 .addOperand(SID)
492 .addImm(CTX)
493 .addImm(CTY)
494 .addImm(CTZ)
495 .addImm(CTW)
496 .addReg(T0, RegState::Implicit)
497 .addReg(T1, RegState::Implicit);
498 break;
499 }
500
501 case AMDGPU::BRANCH:
502 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
503 .addOperand(MI->getOperand(0));
504 break;
505
506 case AMDGPU::BRANCH_COND_f32: {
507 MachineInstr *NewMI =
508 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
509 AMDGPU::PREDICATE_BIT)
510 .addOperand(MI->getOperand(1))
511 .addImm(OPCODE_IS_NOT_ZERO)
512 .addImm(0); // Flags
513 TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
514 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
515 .addOperand(MI->getOperand(0))
516 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
517 break;
518 }
519
520 case AMDGPU::BRANCH_COND_i32: {
521 MachineInstr *NewMI =
522 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
523 AMDGPU::PREDICATE_BIT)
524 .addOperand(MI->getOperand(1))
525 .addImm(OPCODE_IS_NOT_ZERO_INT)
526 .addImm(0); // Flags
527 TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
528 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
529 .addOperand(MI->getOperand(0))
530 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
531 break;
532 }
533
534 case AMDGPU::EG_ExportSwz:
535 case AMDGPU::R600_ExportSwz: {
536     // Instruction is left unmodified if it's not the last one of its type.
537 bool isLastInstructionOfItsType = true;
538 unsigned InstExportType = MI->getOperand(1).getImm();
539 for (MachineBasicBlock::iterator NextExportInst = std::next(I),
540 EndBlock = BB->end(); NextExportInst != EndBlock;
541 NextExportInst = std::next(NextExportInst)) {
542 if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
543 NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
544 unsigned CurrentInstExportType = NextExportInst->getOperand(1)
545 .getImm();
546 if (CurrentInstExportType == InstExportType) {
547 isLastInstructionOfItsType = false;
548 break;
549 }
550 }
551 }
552 bool EOP = isEOP(I);
553 if (!EOP && !isLastInstructionOfItsType)
554 return BB;
555 unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
556 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
557 .addOperand(MI->getOperand(0))
558 .addOperand(MI->getOperand(1))
559 .addOperand(MI->getOperand(2))
560 .addOperand(MI->getOperand(3))
561 .addOperand(MI->getOperand(4))
562 .addOperand(MI->getOperand(5))
563 .addOperand(MI->getOperand(6))
564 .addImm(CfInst)
565 .addImm(EOP);
566 break;
567 }
568 case AMDGPU::RETURN: {
569 // RETURN instructions must have the live-out registers as implicit uses,
570 // otherwise they appear dead.
571 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
572 MachineInstrBuilder MIB(*MF, MI);
573 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
574 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
575 return BB;
576 }
577 }
578
579 MI->eraseFromParent();
580 return BB;
581 }
582
583 //===----------------------------------------------------------------------===//
584 // Custom DAG Lowering Operations
585 //===----------------------------------------------------------------------===//
586
587 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
588 MachineFunction &MF = DAG.getMachineFunction();
589 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
590 switch (Op.getOpcode()) {
591 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
592 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
593 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
594 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
595 case ISD::SRA_PARTS:
596 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
597 case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
598 case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
599 case ISD::FCOS:
600 case ISD::FSIN: return LowerTrig(Op, DAG);
601 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
602 case ISD::STORE: return LowerSTORE(Op, DAG);
603 case ISD::LOAD: {
604 SDValue Result = LowerLOAD(Op, DAG);
605 assert((!Result.getNode() ||
606 Result.getNode()->getNumValues() == 2) &&
607 "Load should return a value and a chain");
608 return Result;
609 }
610
611 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
612 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
613 case ISD::INTRINSIC_VOID: {
614 SDValue Chain = Op.getOperand(0);
615 unsigned IntrinsicID =
616 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
617 switch (IntrinsicID) {
618 case AMDGPUIntrinsic::AMDGPU_store_output: {
619 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
620 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
621 MFI->LiveOuts.push_back(Reg);
622 return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
623 }
624 case AMDGPUIntrinsic::R600_store_swizzle: {
625 SDLoc DL(Op);
626 const SDValue Args[8] = {
627 Chain,
628 Op.getOperand(2), // Export Value
629 Op.getOperand(3), // ArrayBase
630 Op.getOperand(4), // Type
631 DAG.getConstant(0, DL, MVT::i32), // SWZ_X
632 DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
633 DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
634 DAG.getConstant(3, DL, MVT::i32) // SWZ_W
635 };
636 return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
637 }
638
639 // default for switch(IntrinsicID)
640 default: break;
641 }
642 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
643 break;
644 }
645 case ISD::INTRINSIC_WO_CHAIN: {
646 unsigned IntrinsicID =
647 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
648 EVT VT = Op.getValueType();
649 SDLoc DL(Op);
650 switch(IntrinsicID) {
651 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
652 case AMDGPUIntrinsic::R600_load_input: {
653 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
654 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
655 MachineFunction &MF = DAG.getMachineFunction();
656 MachineRegisterInfo &MRI = MF.getRegInfo();
657 MRI.addLiveIn(Reg);
658 return DAG.getCopyFromReg(DAG.getEntryNode(),
659 SDLoc(DAG.getEntryNode()), Reg, VT);
660 }
661
662 case AMDGPUIntrinsic::R600_interp_input: {
663 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
664 int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
665 MachineSDNode *interp;
666 if (ijb < 0) {
667 const R600InstrInfo *TII =
668 static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
669 interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
670 MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
671 return DAG.getTargetExtractSubreg(
672 TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
673 DL, MVT::f32, SDValue(interp, 0));
674 }
675 MachineFunction &MF = DAG.getMachineFunction();
676 MachineRegisterInfo &MRI = MF.getRegInfo();
677 unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
678 unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
679 MRI.addLiveIn(RegisterI);
680 MRI.addLiveIn(RegisterJ);
681 SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
682 SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
683 SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
684 SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
685
686 if (slot % 4 < 2)
687 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
688 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
689 RegisterJNode, RegisterINode);
690 else
691 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
692 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
693 RegisterJNode, RegisterINode);
694 return SDValue(interp, slot % 2);
695 }
696 case AMDGPUIntrinsic::R600_interp_xy:
697 case AMDGPUIntrinsic::R600_interp_zw: {
698 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
699 MachineSDNode *interp;
700 SDValue RegisterINode = Op.getOperand(2);
701 SDValue RegisterJNode = Op.getOperand(3);
702
703 if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
704 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
705 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
706 RegisterJNode, RegisterINode);
707 else
708 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
709 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
710 RegisterJNode, RegisterINode);
711 return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
712 SDValue(interp, 0), SDValue(interp, 1));
713 }
714 case AMDGPUIntrinsic::R600_tex:
715 case AMDGPUIntrinsic::R600_texc:
716 case AMDGPUIntrinsic::R600_txl:
717 case AMDGPUIntrinsic::R600_txlc:
718 case AMDGPUIntrinsic::R600_txb:
719 case AMDGPUIntrinsic::R600_txbc:
720 case AMDGPUIntrinsic::R600_txf:
721 case AMDGPUIntrinsic::R600_txq:
722 case AMDGPUIntrinsic::R600_ddx:
723 case AMDGPUIntrinsic::R600_ddy:
724 case AMDGPUIntrinsic::R600_ldptr: {
725 unsigned TextureOp;
726 switch (IntrinsicID) {
727 case AMDGPUIntrinsic::R600_tex:
728 TextureOp = 0;
729 break;
730 case AMDGPUIntrinsic::R600_texc:
731 TextureOp = 1;
732 break;
733 case AMDGPUIntrinsic::R600_txl:
734 TextureOp = 2;
735 break;
736 case AMDGPUIntrinsic::R600_txlc:
737 TextureOp = 3;
738 break;
739 case AMDGPUIntrinsic::R600_txb:
740 TextureOp = 4;
741 break;
742 case AMDGPUIntrinsic::R600_txbc:
743 TextureOp = 5;
744 break;
745 case AMDGPUIntrinsic::R600_txf:
746 TextureOp = 6;
747 break;
748 case AMDGPUIntrinsic::R600_txq:
749 TextureOp = 7;
750 break;
751 case AMDGPUIntrinsic::R600_ddx:
752 TextureOp = 8;
753 break;
754 case AMDGPUIntrinsic::R600_ddy:
755 TextureOp = 9;
756 break;
757 case AMDGPUIntrinsic::R600_ldptr:
758 TextureOp = 10;
759 break;
760 default:
761       llvm_unreachable("Unknown Texture Operation");
762 }
763
764 SDValue TexArgs[19] = {
765 DAG.getConstant(TextureOp, DL, MVT::i32),
766 Op.getOperand(1),
767 DAG.getConstant(0, DL, MVT::i32),
768 DAG.getConstant(1, DL, MVT::i32),
769 DAG.getConstant(2, DL, MVT::i32),
770 DAG.getConstant(3, DL, MVT::i32),
771 Op.getOperand(2),
772 Op.getOperand(3),
773 Op.getOperand(4),
774 DAG.getConstant(0, DL, MVT::i32),
775 DAG.getConstant(1, DL, MVT::i32),
776 DAG.getConstant(2, DL, MVT::i32),
777 DAG.getConstant(3, DL, MVT::i32),
778 Op.getOperand(5),
779 Op.getOperand(6),
780 Op.getOperand(7),
781 Op.getOperand(8),
782 Op.getOperand(9),
783 Op.getOperand(10)
784 };
785 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
786 }
787 case AMDGPUIntrinsic::AMDGPU_dp4: {
788 SDValue Args[8] = {
789 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
790 DAG.getConstant(0, DL, MVT::i32)),
791 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
792 DAG.getConstant(0, DL, MVT::i32)),
793 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
794 DAG.getConstant(1, DL, MVT::i32)),
795 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
796 DAG.getConstant(1, DL, MVT::i32)),
797 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
798 DAG.getConstant(2, DL, MVT::i32)),
799 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
800 DAG.getConstant(2, DL, MVT::i32)),
801 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
802 DAG.getConstant(3, DL, MVT::i32)),
803 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
804 DAG.getConstant(3, DL, MVT::i32))
805 };
806 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
807 }
808
809 case Intrinsic::r600_read_ngroups_x:
810 return LowerImplicitParameter(DAG, VT, DL, 0);
811 case Intrinsic::r600_read_ngroups_y:
812 return LowerImplicitParameter(DAG, VT, DL, 1);
813 case Intrinsic::r600_read_ngroups_z:
814 return LowerImplicitParameter(DAG, VT, DL, 2);
815 case Intrinsic::r600_read_global_size_x:
816 return LowerImplicitParameter(DAG, VT, DL, 3);
817 case Intrinsic::r600_read_global_size_y:
818 return LowerImplicitParameter(DAG, VT, DL, 4);
819 case Intrinsic::r600_read_global_size_z:
820 return LowerImplicitParameter(DAG, VT, DL, 5);
821 case Intrinsic::r600_read_local_size_x:
822 return LowerImplicitParameter(DAG, VT, DL, 6);
823 case Intrinsic::r600_read_local_size_y:
824 return LowerImplicitParameter(DAG, VT, DL, 7);
825 case Intrinsic::r600_read_local_size_z:
826 return LowerImplicitParameter(DAG, VT, DL, 8);
827
828 case Intrinsic::AMDGPU_read_workdim: {
829 uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
830 return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
831 }
832
833 case Intrinsic::r600_read_tgid_x:
834 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
835 AMDGPU::T1_X, VT);
836 case Intrinsic::r600_read_tgid_y:
837 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
838 AMDGPU::T1_Y, VT);
839 case Intrinsic::r600_read_tgid_z:
840 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
841 AMDGPU::T1_Z, VT);
842 case Intrinsic::r600_read_tidig_x:
843 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
844 AMDGPU::T0_X, VT);
845 case Intrinsic::r600_read_tidig_y:
846 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
847 AMDGPU::T0_Y, VT);
848 case Intrinsic::r600_read_tidig_z:
849 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
850 AMDGPU::T0_Z, VT);
851 case Intrinsic::AMDGPU_rsq:
852 // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
853 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
854
855 case AMDGPUIntrinsic::AMDGPU_fract:
856 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
857 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
858 }
859 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
860 break;
861 }
862 } // end switch(Op.getOpcode())
863 return SDValue();
864 }
865
866 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
867 SmallVectorImpl<SDValue> &Results,
868 SelectionDAG &DAG) const {
869 switch (N->getOpcode()) {
870 default:
871 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
872 return;
873 case ISD::FP_TO_UINT:
874 if (N->getValueType(0) == MVT::i1) {
875 Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
876 return;
877 }
878 // Fall-through. Since we don't care about out of bounds values
879 // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
880 // considers some extra cases which are not necessary here.
881 case ISD::FP_TO_SINT: {
882 SDValue Result;
883 if (expandFP_TO_SINT(N, Result, DAG))
884 Results.push_back(Result);
885 return;
886 }
887 case ISD::SDIVREM: {
888 SDValue Op = SDValue(N, 1);
889 SDValue RES = LowerSDIVREM(Op, DAG);
890 Results.push_back(RES);
891 Results.push_back(RES.getValue(1));
892 break;
893 }
894 case ISD::UDIVREM: {
895 SDValue Op = SDValue(N, 0);
896 LowerUDIVREM64(Op, DAG, Results);
897 break;
898 }
899 }
900 }
901
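// Helper used by the EXTRACT/INSERT_VECTOR_ELT lowerings below for vectors
// addressed with a non-constant index: the vector is rebuilt element by
// element as a BUILD_VERTICAL_VECTOR node.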
902 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
903 SDValue Vector) const {
904
905 SDLoc DL(Vector);
906 EVT VecVT = Vector.getValueType();
907 EVT EltVT = VecVT.getVectorElementType();
908 SmallVector<SDValue, 8> Args;
909
910 for (unsigned i = 0, e = VecVT.getVectorNumElements();
911 i != e; ++i) {
912 Args.push_back(DAG.getNode(
913 ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
914 DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
915 }
916
917 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
918 }
919
920 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
921 SelectionDAG &DAG) const {
922
923 SDLoc DL(Op);
924 SDValue Vector = Op.getOperand(0);
925 SDValue Index = Op.getOperand(1);
926
927 if (isa<ConstantSDNode>(Index) ||
928 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
929 return Op;
930
931 Vector = vectorToVerticalVector(DAG, Vector);
932 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
933 Vector, Index);
934 }
935
936 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
937 SelectionDAG &DAG) const {
938 SDLoc DL(Op);
939 SDValue Vector = Op.getOperand(0);
940 SDValue Value = Op.getOperand(1);
941 SDValue Index = Op.getOperand(2);
942
943 if (isa<ConstantSDNode>(Index) ||
944 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
945 return Op;
946
947 Vector = vectorToVerticalVector(DAG, Vector);
948 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
949 Vector, Value, Index);
950 return vectorToVerticalVector(DAG, Insert);
951 }
952
953 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
954   // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
955   // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
956 EVT VT = Op.getValueType();
957 SDValue Arg = Op.getOperand(0);
958 SDLoc DL(Op);
959
960 // TODO: Should this propagate fast-math-flags?
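  // 0.15915494309 is 1/(2*pi): the argument is scaled to revolutions, offset
  // by 0.5, reduced to its fractional part, then shifted back down before the
  // COS_HW/SIN_HW node.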
961 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
962 DAG.getNode(ISD::FADD, DL, VT,
963 DAG.getNode(ISD::FMUL, DL, VT, Arg,
964 DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
965 DAG.getConstantFP(0.5, DL, MVT::f32)));
966 unsigned TrigNode;
967 switch (Op.getOpcode()) {
968 case ISD::FCOS:
969 TrigNode = AMDGPUISD::COS_HW;
970 break;
971 case ISD::FSIN:
972 TrigNode = AMDGPUISD::SIN_HW;
973 break;
974 default:
975 llvm_unreachable("Wrong trig opcode");
976 }
977 SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
978 DAG.getNode(ISD::FADD, DL, VT, FractPart,
979 DAG.getConstantFP(-0.5, DL, MVT::f32)));
980 if (Gen >= AMDGPUSubtarget::R700)
981 return TrigVal;
982 // On R600 hw, COS/SIN input must be between -Pi and Pi.
983 return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
984 DAG.getConstantFP(3.14159265359, DL, MVT::f32));
985 }
986
987 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
988 SDLoc DL(Op);
989 EVT VT = Op.getValueType();
990
991 SDValue Lo = Op.getOperand(0);
992 SDValue Hi = Op.getOperand(1);
993 SDValue Shift = Op.getOperand(2);
994 SDValue Zero = DAG.getConstant(0, DL, VT);
995 SDValue One = DAG.getConstant(1, DL, VT);
996
997 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
998 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
999 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1000 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1001
1002   // The dance around Width1 is necessary for the Shift == 0 special case.
1003   // Without it, CompShift could be 32, producing an incorrect Overflow value.
1004   // So we do the shift in two steps; the alternative would be to add a
1005   // conditional to filter out the special case.
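  // For example, with Shift == 0: CompShift == 31, so Lo is shifted right by
  // 31 and then by 1 more, giving 0 instead of the undefined Lo >> 32.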
1006
1007 SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1008 Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1009
1010 SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1011 HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1012 SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1013
1014 SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1015 SDValue LoBig = Zero;
1016
1017 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1018 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1019
1020 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1021 }
1022
1023 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1024 SDLoc DL(Op);
1025 EVT VT = Op.getValueType();
1026
1027 SDValue Lo = Op.getOperand(0);
1028 SDValue Hi = Op.getOperand(1);
1029 SDValue Shift = Op.getOperand(2);
1030 SDValue Zero = DAG.getConstant(0, DL, VT);
1031 SDValue One = DAG.getConstant(1, DL, VT);
1032
1033 const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1034
1035 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1036 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1037 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1038 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1039
1040   // The dance around Width1 is necessary for the Shift == 0 special case.
1041   // Without it, CompShift could be 32, producing an incorrect Overflow value.
1042   // So we do the shift in two steps; the alternative would be to add a
1043   // conditional to filter out the special case.
1044
1045 SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1046 Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1047
1048 SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1049 SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1050 LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1051
1052 SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1053 SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1054
1055 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1056 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1057
1058 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1059 }
1060
1061 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1062 unsigned mainop, unsigned ovf) const {
1063 SDLoc DL(Op);
1064 EVT VT = Op.getValueType();
1065
1066 SDValue Lo = Op.getOperand(0);
1067 SDValue Hi = Op.getOperand(1);
1068
1069 SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1070 // Extend sign.
1071 OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1072 DAG.getValueType(MVT::i1));
1073
1074 SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1075
1076 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1077 }
1078
1079 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
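  // Only the i1 case reaches here (see ReplaceNodeResults): converting an fp
  // value to a 1-bit unsigned integer is simply a compare against 0.0.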
1080 SDLoc DL(Op);
1081 return DAG.getNode(
1082 ISD::SETCC,
1083 DL,
1084 MVT::i1,
1085 Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1086 DAG.getCondCode(ISD::SETNE)
1087 );
1088 }
1089
1090 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1091 SDLoc DL,
1092 unsigned DwordOffset) const {
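  // Implicit kernel parameters (ngroups, global/local size, work dim) live at
  // fixed dword offsets in CONSTANT_BUFFER_0, so they are fetched with an
  // ordinary load from that address space.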
1093 unsigned ByteOffset = DwordOffset * 4;
1094 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1095 AMDGPUAS::CONSTANT_BUFFER_0);
1096
1097   // We shouldn't be using an offset wider than 16 bits for implicit parameters.
1098 assert(isInt<16>(ByteOffset));
1099
1100 return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1101 DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1102 MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1103 false, false, false, 0);
1104 }
1105
1106 bool R600TargetLowering::isZero(SDValue Op) const {
1107 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1108 return Cst->isNullValue();
1109 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1110 return CstFP->isZero();
1111 } else {
1112 return false;
1113 }
1114 }
1115
1116 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1117 SDLoc DL(Op);
1118 EVT VT = Op.getValueType();
1119
1120 SDValue LHS = Op.getOperand(0);
1121 SDValue RHS = Op.getOperand(1);
1122 SDValue True = Op.getOperand(2);
1123 SDValue False = Op.getOperand(3);
1124 SDValue CC = Op.getOperand(4);
1125 SDValue Temp;
1126
1127 if (VT == MVT::f32) {
1128 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1129 SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1130 if (MinMax)
1131 return MinMax;
1132 }
1133
1134 // LHS and RHS are guaranteed to be the same value type
1135 EVT CompareVT = LHS.getValueType();
1136
1137 // Check if we can lower this to a native operation.
1138
1139 // Try to lower to a SET* instruction:
1140 //
1141 // SET* can match the following patterns:
1142 //
1143 // select_cc f32, f32, -1, 0, cc_supported
1144 // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1145 // select_cc i32, i32, -1, 0, cc_supported
1146 //
1147
1148 // Move hardware True/False values to the correct operand.
1149 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1150 ISD::CondCode InverseCC =
1151 ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1152 if (isHWTrueValue(False) && isHWFalseValue(True)) {
1153 if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1154 std::swap(False, True);
1155 CC = DAG.getCondCode(InverseCC);
1156 } else {
1157 ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1158 if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1159 std::swap(False, True);
1160 std::swap(LHS, RHS);
1161 CC = DAG.getCondCode(SwapInvCC);
1162 }
1163 }
1164 }
1165
1166 if (isHWTrueValue(True) && isHWFalseValue(False) &&
1167 (CompareVT == VT || VT == MVT::i32)) {
1168 // This can be matched by a SET* instruction.
1169 return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1170 }
1171
1172 // Try to lower to a CND* instruction:
1173 //
1174 // CND* can match the following patterns:
1175 //
1176 // select_cc f32, 0.0, f32, f32, cc_supported
1177 // select_cc f32, 0.0, i32, i32, cc_supported
1178 // select_cc i32, 0, f32, f32, cc_supported
1179 // select_cc i32, 0, i32, i32, cc_supported
1180 //
1181
1182 // Try to move the zero value to the RHS
1183 if (isZero(LHS)) {
1184 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1185 // Try swapping the operands
1186 ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1187 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1188 std::swap(LHS, RHS);
1189 CC = DAG.getCondCode(CCSwapped);
1190 } else {
1191       // Try inverting the condition and then swapping the operands.
1192 ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1193 CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1194 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1195 std::swap(True, False);
1196 std::swap(LHS, RHS);
1197 CC = DAG.getCondCode(CCSwapped);
1198 }
1199 }
1200 }
1201 if (isZero(RHS)) {
1202 SDValue Cond = LHS;
1203 SDValue Zero = RHS;
1204 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1205 if (CompareVT != VT) {
1206 // Bitcast True / False to the correct types. This will end up being
1207 // a nop, but it allows us to define only a single pattern in the
1208 // .TD files for each CND* instruction rather than having to have
1209 // one pattern for integer True/False and one for fp True/False
1210 True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1211 False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1212 }
1213
1214 switch (CCOpcode) {
1215 case ISD::SETONE:
1216 case ISD::SETUNE:
1217 case ISD::SETNE:
1218 CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1219 Temp = True;
1220 True = False;
1221 False = Temp;
1222 break;
1223 default:
1224 break;
1225 }
1226 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1227 Cond, Zero,
1228 True, False,
1229 DAG.getCondCode(CCOpcode));
1230 return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1231 }
1232
1233   // If we make it this far, it means we have no native instructions to handle
1234 // this SELECT_CC, so we must lower it.
1235 SDValue HWTrue, HWFalse;
1236
1237 if (CompareVT == MVT::f32) {
1238 HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1239 HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1240 } else if (CompareVT == MVT::i32) {
1241 HWTrue = DAG.getConstant(-1, DL, CompareVT);
1242 HWFalse = DAG.getConstant(0, DL, CompareVT);
1243 }
1244 else {
1245 llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1246 }
1247
1248 // Lower this unsupported SELECT_CC into a combination of two supported
1249 // SELECT_CC operations.
1250 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1251
1252 return DAG.getNode(ISD::SELECT_CC, DL, VT,
1253 Cond, HWFalse,
1254 True, False,
1255 DAG.getCondCode(ISD::SETNE));
1256 }
1257
1258 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to
1259 /// convert these pointers to a register index. Each register holds
1260 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
1261 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1262 /// for indirect addressing.
1263 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1264 unsigned StackWidth,
1265 SelectionDAG &DAG) const {
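  // Each register index covers StackWidth 32-bit channels (StackWidth * 4
  // bytes), so the byte-addressed pointer is shifted right by
  // log2(StackWidth * 4).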
1266 unsigned SRLPad;
1267 switch(StackWidth) {
1268 case 1:
1269 SRLPad = 2;
1270 break;
1271 case 2:
1272 SRLPad = 3;
1273 break;
1274 case 4:
1275 SRLPad = 4;
1276 break;
1277 default: llvm_unreachable("Invalid stack width");
1278 }
1279
1280 SDLoc DL(Ptr);
1281 return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1282 DAG.getConstant(SRLPad, DL, MVT::i32));
1283 }
1284
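// Maps a vector element index onto the channel within a stack register and
// the pointer increment needed to advance to the next register, based on how
// many of the four channels (StackWidth) each stack entry uses.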
1285 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1286 unsigned ElemIdx,
1287 unsigned &Channel,
1288 unsigned &PtrIncr) const {
1289 switch (StackWidth) {
1290 default:
1291 case 1:
1292 Channel = 0;
1293 if (ElemIdx > 0) {
1294 PtrIncr = 1;
1295 } else {
1296 PtrIncr = 0;
1297 }
1298 break;
1299 case 2:
1300 Channel = ElemIdx % 2;
1301 if (ElemIdx == 2) {
1302 PtrIncr = 1;
1303 } else {
1304 PtrIncr = 0;
1305 }
1306 break;
1307 case 4:
1308 Channel = ElemIdx;
1309 PtrIncr = 0;
1310 break;
1311 }
1312 }
1313
1314 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1315 SDLoc DL(Op);
1316 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1317 SDValue Chain = Op.getOperand(0);
1318 SDValue Value = Op.getOperand(1);
1319 SDValue Ptr = Op.getOperand(2);
1320
1321 SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1322 if (Result.getNode()) {
1323 return Result;
1324 }
1325
1326 if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1327 if (StoreNode->isTruncatingStore()) {
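      // Truncating global stores (i8/i16) are emitted as a masked dword write
      // (STORE_MSKOR): the value and mask are shifted to the byte position
      // within the containing 32-bit word, which is addressed by DWordAddr.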
1328 EVT VT = Value.getValueType();
1329 assert(VT.bitsLE(MVT::i32));
1330 EVT MemVT = StoreNode->getMemoryVT();
1331 SDValue MaskConstant;
1332 if (MemVT == MVT::i8) {
1333 MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1334 } else {
1335 assert(MemVT == MVT::i16);
1336 MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1337 }
1338 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1339 DAG.getConstant(2, DL, MVT::i32));
1340 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1341 DAG.getConstant(0x00000003, DL, VT));
1342 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1343 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1344 DAG.getConstant(3, DL, VT));
1345 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1346 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1347 // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1348 // vector instead.
1349 SDValue Src[4] = {
1350 ShiftedValue,
1351 DAG.getConstant(0, DL, MVT::i32),
1352 DAG.getConstant(0, DL, MVT::i32),
1353 Mask
1354 };
1355 SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1356 SDValue Args[3] = { Chain, Input, DWordAddr };
1357 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1358 Op->getVTList(), Args, MemVT,
1359 StoreNode->getMemOperand());
1360 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1361 Value.getValueType().bitsGE(MVT::i32)) {
1362 // Convert pointer from byte address to dword address.
1363 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1364 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1365 Ptr, DAG.getConstant(2, DL, MVT::i32)));
1366
1367 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1368 llvm_unreachable("Truncated and indexed stores not supported yet");
1369 } else {
1370 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1371 }
1372 return Chain;
1373 }
1374 }
1375
1376 EVT ValueVT = Value.getValueType();
1377
1378 if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1379 return SDValue();
1380 }
1381
1382 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1383 if (Ret.getNode()) {
1384 return Ret;
1385 }
1386 // Lowering for indirect addressing
1387
1388 const MachineFunction &MF = DAG.getMachineFunction();
1389 const AMDGPUFrameLowering *TFL =
1390 static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1391 unsigned StackWidth = TFL->getStackWidth(MF);
1392
1393 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1394
1395 if (ValueVT.isVector()) {
1396 unsigned NumElemVT = ValueVT.getVectorNumElements();
1397 EVT ElemVT = ValueVT.getVectorElementType();
1398 SmallVector<SDValue, 4> Stores(NumElemVT);
1399
1400 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1401 "vector width in load");
1402
1403 for (unsigned i = 0; i < NumElemVT; ++i) {
1404 unsigned Channel, PtrIncr;
1405 getStackAddress(StackWidth, i, Channel, PtrIncr);
1406 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1407 DAG.getConstant(PtrIncr, DL, MVT::i32));
1408 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1409 Value, DAG.getConstant(i, DL, MVT::i32));
1410
1411 Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1412 Chain, Elem, Ptr,
1413 DAG.getTargetConstant(Channel, DL, MVT::i32));
1414 }
1415 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1416 } else {
1417 if (ValueVT == MVT::i8) {
1418 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1419 }
1420 Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1421 DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1422 }
1423
1424 return Chain;
1425 }
1426
1427 // Returns 512 + (kc_bank << 12) for the given constant buffer address space.
1428 static int
1429 ConstantAddressBlock(unsigned AddressSpace) {
1430 switch (AddressSpace) {
1431 case AMDGPUAS::CONSTANT_BUFFER_0:
1432 return 512;
1433 case AMDGPUAS::CONSTANT_BUFFER_1:
1434 return 512 + 4096;
1435 case AMDGPUAS::CONSTANT_BUFFER_2:
1436 return 512 + 4096 * 2;
1437 case AMDGPUAS::CONSTANT_BUFFER_3:
1438 return 512 + 4096 * 3;
1439 case AMDGPUAS::CONSTANT_BUFFER_4:
1440 return 512 + 4096 * 4;
1441 case AMDGPUAS::CONSTANT_BUFFER_5:
1442 return 512 + 4096 * 5;
1443 case AMDGPUAS::CONSTANT_BUFFER_6:
1444 return 512 + 4096 * 6;
1445 case AMDGPUAS::CONSTANT_BUFFER_7:
1446 return 512 + 4096 * 7;
1447 case AMDGPUAS::CONSTANT_BUFFER_8:
1448 return 512 + 4096 * 8;
1449 case AMDGPUAS::CONSTANT_BUFFER_9:
1450 return 512 + 4096 * 9;
1451 case AMDGPUAS::CONSTANT_BUFFER_10:
1452 return 512 + 4096 * 10;
1453 case AMDGPUAS::CONSTANT_BUFFER_11:
1454 return 512 + 4096 * 11;
1455 case AMDGPUAS::CONSTANT_BUFFER_12:
1456 return 512 + 4096 * 12;
1457 case AMDGPUAS::CONSTANT_BUFFER_13:
1458 return 512 + 4096 * 13;
1459 case AMDGPUAS::CONSTANT_BUFFER_14:
1460 return 512 + 4096 * 14;
1461 case AMDGPUAS::CONSTANT_BUFFER_15:
1462 return 512 + 4096 * 15;
1463 default:
1464 return -1;
1465 }
1466 }
1467
1468 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1469 {
1470 EVT VT = Op.getValueType();
1471 SDLoc DL(Op);
1472 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1473 SDValue Chain = Op.getOperand(0);
1474 SDValue Ptr = Op.getOperand(1);
1475 SDValue LoweredLoad;
1476
1477 if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
1478 return Ret;
1479
1480   // Lower constant address space loads of global variables.
1481 if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1482 isa<GlobalVariable>(GetUnderlyingObject(
1483 LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
1484
1485 SDValue Ptr = DAG.getZExtOrTrunc(
1486 LoadNode->getBasePtr(), DL,
1487 getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
1488 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1489 DAG.getConstant(2, DL, MVT::i32));
1490 return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1491 LoadNode->getChain(), Ptr,
1492 DAG.getTargetConstant(0, DL, MVT::i32),
1493 Op.getOperand(2));
1494 }
1495
1496 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1497 SDValue MergedValues[2] = {
1498 ScalarizeVectorLoad(Op, DAG),
1499 Chain
1500 };
1501 return DAG.getMergeValues(MergedValues, DL);
1502 }
1503
1504 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1505 if (ConstantBlock > -1 &&
1506 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1507 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1508 SDValue Result;
1509 if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1510 isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1511 isa<ConstantSDNode>(Ptr)) {
1512 SDValue Slots[4];
1513 for (unsigned i = 0; i < 4; i++) {
1514 // We want Const position encoded with the following formula :
1515 // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1516 // const_index is Ptr computed by llvm using an alignment of 16.
1517 // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1518 // then div by 4 at the ISel step
1519 SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1520 DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1521 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1522 }
1523 EVT NewVT = MVT::v4i32;
1524 unsigned NumElements = 4;
1525 if (VT.isVector()) {
1526 NewVT = VT;
1527 NumElements = VT.getVectorNumElements();
1528 }
1529 Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1530 makeArrayRef(Slots, NumElements));
1531 } else {
1532       // A non-constant ptr can't be folded, so keep it as a v4f32 load.
1533 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1534 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1535 DAG.getConstant(4, DL, MVT::i32)),
1536 DAG.getConstant(LoadNode->getAddressSpace() -
1537 AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1538 );
1539 }
1540
1541 if (!VT.isVector()) {
1542 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1543 DAG.getConstant(0, DL, MVT::i32));
1544 }
1545
1546 SDValue MergedValues[2] = {
1547 Result,
1548 Chain
1549 };
1550 return DAG.getMergeValues(MergedValues, DL);
1551 }
1552
1553 // For most operations returning SDValue() will result in the node being
1554 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1555 // need to manually expand loads that may be legal in some address spaces and
1556 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1557 // compute shaders, since the data is sign extended when it is uploaded to the
1558 // buffer. However SEXT loads from other address spaces are not supported, so
1559 // we need to expand them here.
1560 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1561 EVT MemVT = LoadNode->getMemoryVT();
1562 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1563 SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1564 LoadNode->getPointerInfo(), MemVT,
1565 LoadNode->isVolatile(),
1566 LoadNode->isNonTemporal(),
1567 LoadNode->isInvariant(),
1568 LoadNode->getAlignment());
1569 SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1570 DAG.getValueType(MemVT));
1571
1572 SDValue MergedValues[2] = { Res, Chain };
1573 return DAG.getMergeValues(MergedValues, DL);
1574 }
1575
1576 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1577 return SDValue();
1578 }
1579
1580 // Lowering for indirect addressing
1581 const MachineFunction &MF = DAG.getMachineFunction();
1582 const AMDGPUFrameLowering *TFL =
1583 static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1584 unsigned StackWidth = TFL->getStackWidth(MF);
1585
1586 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1587
1588 if (VT.isVector()) {
1589 unsigned NumElemVT = VT.getVectorNumElements();
1590 EVT ElemVT = VT.getVectorElementType();
1591 SDValue Loads[4];
1592
1593 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1594 "vector width in load");
1595
1596 for (unsigned i = 0; i < NumElemVT; ++i) {
1597 unsigned Channel, PtrIncr;
1598 getStackAddress(StackWidth, i, Channel, PtrIncr);
1599 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1600 DAG.getConstant(PtrIncr, DL, MVT::i32));
1601 Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1602 Chain, Ptr,
1603 DAG.getTargetConstant(Channel, DL, MVT::i32),
1604 Op.getOperand(2));
1605 }
1606 for (unsigned i = NumElemVT; i < 4; ++i) {
1607 Loads[i] = DAG.getUNDEF(ElemVT);
1608 }
1609 EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1610 LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1611 } else {
1612 LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1613 Chain, Ptr,
1614 DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1615 Op.getOperand(2));
1616 }
1617
1618 SDValue Ops[2] = {
1619 LoweredLoad,
1620 Chain
1621 };
1622
1623 return DAG.getMergeValues(Ops, DL);
1624 }
1625
1626 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1627 SDValue Chain = Op.getOperand(0);
1628 SDValue Cond = Op.getOperand(1);
1629 SDValue Jump = Op.getOperand(2);
1630
1631 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1632 Chain, Jump, Cond);
1633 }
1634
1635 /// XXX Only kernel functions are supported, so we can assume for now that
1636 /// every function is a kernel function, but in the future we should use
1637 /// separate calling conventions for kernel and non-kernel functions.
1638 SDValue R600TargetLowering::LowerFormalArguments(
1639 SDValue Chain,
1640 CallingConv::ID CallConv,
1641 bool isVarArg,
1642 const SmallVectorImpl<ISD::InputArg> &Ins,
1643 SDLoc DL, SelectionDAG &DAG,
1644 SmallVectorImpl<SDValue> &InVals) const {
1645 SmallVector<CCValAssign, 16> ArgLocs;
1646 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1647 *DAG.getContext());
1648 MachineFunction &MF = DAG.getMachineFunction();
1649 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1650
1651 SmallVector<ISD::InputArg, 8> LocalIns;
1652
1653 getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1654
1655 AnalyzeFormalArguments(CCInfo, LocalIns);
1656
1657 for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1658 CCValAssign &VA = ArgLocs[i];
1659 const ISD::InputArg &In = Ins[i];
1660 EVT VT = In.VT;
1661 EVT MemVT = VA.getLocVT();
1662 if (!VT.isVector() && MemVT.isVector()) {
1663 // Get load source type if scalarized.
1664 MemVT = MemVT.getVectorElementType();
1665 }
1666
1667 if (MFI->getShaderType() != ShaderType::COMPUTE) {
1668 unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1669 SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1670 InVals.push_back(Register);
1671 continue;
1672 }
1673
1674 PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1675 AMDGPUAS::CONSTANT_BUFFER_0);
1676
1677 // i64 isn't a legal type, so the register type used ends up as i32, which
1678 // isn't expected here. It attempts to create this sextload, but it ends up
1679 // being invalid. Somehow this seems to work with i64 arguments, but breaks
1680 // for <1 x i64>.
1681
1682     // The first 36 bytes of the input buffer contain information about
1683 // thread group and global sizes.
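    // (This is why a 36 byte bias is added to the load offset below; kernel
    // arguments are assumed to start right after this header.)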
1684 ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1685 if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1686 // FIXME: This should really check the extload type, but the handling of
1687 // extload vector parameters seems to be broken.
1688
1689 // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1690 Ext = ISD::SEXTLOAD;
1691 }
1692
1693 // Compute the offset from the value.
1694 // XXX - I think PartOffset should give you this, but it seems to give the
1695     // size of the register, which isn't useful.
1696
1697 unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1698 unsigned PartOffset = VA.getLocMemOffset();
1699 unsigned Offset = 36 + VA.getLocMemOffset();
1700
1701 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1702 SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1703 DAG.getConstant(Offset, DL, MVT::i32),
1704 DAG.getUNDEF(MVT::i32),
1705 PtrInfo,
1706 MemVT, false, true, true, 4);
1707
1708 // 4 is the preferred alignment for the CONSTANT memory space.
1709 InVals.push_back(Arg);
1710 MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1711 }
1712 return Chain;
1713 }
1714
1715 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1716 EVT VT) const {
1717 if (!VT.isVector())
1718 return MVT::i32;
1719 return VT.changeVectorElementTypeToInteger();
1720 }
1721
1722 static SDValue CompactSwizzlableVector(
1723 SelectionDAG &DAG, SDValue VectorEntry,
1724 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1725 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1726 assert(RemapSwizzle.empty());
1727 SDValue NewBldVec[4] = {
1728 VectorEntry.getOperand(0),
1729 VectorEntry.getOperand(1),
1730 VectorEntry.getOperand(2),
1731 VectorEntry.getOperand(3)
1732 };
1733
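  // Swizzle selector values used below (matching the SEL_* comments in this
  // function): 0-3 select a source component, 4 = SEL_0, 5 = SEL_1 and
  // 7 = SEL_MASK_WRITE.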
1734 for (unsigned i = 0; i < 4; i++) {
1735 if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1736       // We mask the write here to teach later passes that the ith element of this
1737       // vector is undef. Thus we can use it to reduce 128-bit register usage,
1738       // break false dependencies and additionally make the assembly easier to read.
1739 RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1740 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1741 if (C->isZero()) {
1742 RemapSwizzle[i] = 4; // SEL_0
1743 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1744 } else if (C->isExactlyValue(1.0)) {
1745 RemapSwizzle[i] = 5; // SEL_1
1746 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1747 }
1748 }
1749
1750 if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1751 continue;
1752 for (unsigned j = 0; j < i; j++) {
1753 if (NewBldVec[i] == NewBldVec[j]) {
1754 NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1755 RemapSwizzle[i] = j;
1756 break;
1757 }
1758 }
1759 }
1760
1761 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1762 VectorEntry.getValueType(), NewBldVec);
1763 }
1764
1765 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1766 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1767 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1768 assert(RemapSwizzle.empty());
1769 SDValue NewBldVec[4] = {
1770 VectorEntry.getOperand(0),
1771 VectorEntry.getOperand(1),
1772 VectorEntry.getOperand(2),
1773 VectorEntry.getOperand(3)
1774 };
1775 bool isUnmovable[4] = { false, false, false, false };
1776 for (unsigned i = 0; i < 4; i++) {
1777 RemapSwizzle[i] = i;
1778 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1779 unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1780 ->getZExtValue();
1781 if (i == Idx)
1782 isUnmovable[Idx] = true;
1783 }
1784 }
1785
1786 for (unsigned i = 0; i < 4; i++) {
1787 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1788 unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1789 ->getZExtValue();
1790 if (isUnmovable[Idx])
1791 continue;
1792 // Swap i and Idx
1793 std::swap(NewBldVec[Idx], NewBldVec[i]);
1794 std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1795 break;
1796 }
1797 }
1798
1799 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1800 VectorEntry.getValueType(), NewBldVec);
1801 }
1802
1803
1804 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1805 SDValue Swz[4], SelectionDAG &DAG,
1806 SDLoc DL) const {
1807 assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1808 // Old -> New swizzle values
1809 DenseMap<unsigned, unsigned> SwizzleRemap;
1810
1811 BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1812 for (unsigned i = 0; i < 4; i++) {
1813 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1814 if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1815 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1816 }
1817
1818 SwizzleRemap.clear();
1819 BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1820 for (unsigned i = 0; i < 4; i++) {
1821 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1822 if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1823 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1824 }
1825
1826 return BuildVector;
1827 }
1828
1829
1830 //===----------------------------------------------------------------------===//
1831 // Custom DAG Optimizations
1832 //===----------------------------------------------------------------------===//
1833
1834 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1835 DAGCombinerInfo &DCI) const {
1836 SelectionDAG &DAG = DCI.DAG;
1837
1838 switch (N->getOpcode()) {
1839 default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1840 // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1841 case ISD::FP_ROUND: {
1842 SDValue Arg = N->getOperand(0);
1843 if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1844 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1845 Arg.getOperand(0));
1846 }
1847 break;
1848 }
1849
1850 // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1851 // (i32 select_cc f32, f32, -1, 0 cc)
1852 //
1853 // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1854 // this to one of the SET*_DX10 instructions.
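  // For example (illustrative operands only):
  //   (i32 fp_to_sint (fneg (select_cc a, b, 1.0, 0.0, setgt)))
  // becomes
  //   (i32 select_cc a, b, -1, 0, setgt)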
1855 case ISD::FP_TO_SINT: {
1856 SDValue FNeg = N->getOperand(0);
1857 if (FNeg.getOpcode() != ISD::FNEG) {
1858 return SDValue();
1859 }
1860 SDValue SelectCC = FNeg.getOperand(0);
1861 if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1862 SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1863 SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1864 !isHWTrueValue(SelectCC.getOperand(2)) ||
1865 !isHWFalseValue(SelectCC.getOperand(3))) {
1866 return SDValue();
1867 }
1868
1869 SDLoc dl(N);
1870 return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1871 SelectCC.getOperand(0), // LHS
1872 SelectCC.getOperand(1), // RHS
1873 DAG.getConstant(-1, dl, MVT::i32), // True
1874 DAG.getConstant(0, dl, MVT::i32), // False
1875 SelectCC.getOperand(4)); // CC
1876
1877 break;
1878 }
1879
1880 // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1881 // => build_vector elt0, ... , NewEltIdx, ... , eltN
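  // For example (illustrative operands only):
  //   insert_vector_elt (build_vector e0, e1, e2, e3), x, 2
  //   -> build_vector e0, e1, x, e3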
1882 case ISD::INSERT_VECTOR_ELT: {
1883 SDValue InVec = N->getOperand(0);
1884 SDValue InVal = N->getOperand(1);
1885 SDValue EltNo = N->getOperand(2);
1886 SDLoc dl(N);
1887
1888 // If the inserted element is an UNDEF, just use the input vector.
1889 if (InVal.getOpcode() == ISD::UNDEF)
1890 return InVec;
1891
1892 EVT VT = InVec.getValueType();
1893
1894 // If we can't generate a legal BUILD_VECTOR, exit
1895 if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1896 return SDValue();
1897
1898 // Check that we know which element is being inserted
1899 if (!isa<ConstantSDNode>(EltNo))
1900 return SDValue();
1901 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1902
1903 // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1904 // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
1905 // vector elements.
1906 SmallVector<SDValue, 8> Ops;
1907 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1908 Ops.append(InVec.getNode()->op_begin(),
1909 InVec.getNode()->op_end());
1910 } else if (InVec.getOpcode() == ISD::UNDEF) {
1911 unsigned NElts = VT.getVectorNumElements();
1912 Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1913 } else {
1914 return SDValue();
1915 }
1916
1917 // Insert the element
1918 if (Elt < Ops.size()) {
1919 // All the operands of BUILD_VECTOR must have the same type;
1920 // we enforce that here.
1921 EVT OpVT = Ops[0].getValueType();
1922 if (InVal.getValueType() != OpVT)
1923 InVal = OpVT.bitsGT(InVal.getValueType()) ?
1924 DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1925 DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1926 Ops[Elt] = InVal;
1927 }
1928
1929 // Return the new vector
1930 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1931 }
1932
1933   // An extract_vector_elt of a build_vector generated by custom lowering
1934   // also needs to be combined here.
1935 case ISD::EXTRACT_VECTOR_ELT: {
1936 SDValue Arg = N->getOperand(0);
1937 if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1938 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1939 unsigned Element = Const->getZExtValue();
1940 return Arg->getOperand(Element);
1941 }
1942 }
1943 if (Arg.getOpcode() == ISD::BITCAST &&
1944 Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1945 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1946 unsigned Element = Const->getZExtValue();
1947 return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1948 Arg->getOperand(0).getOperand(Element));
1949 }
1950 }
1951 break;
1952 }
1953
1954 case ISD::SELECT_CC: {
1955 // Try common optimizations
1956 SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1957 if (Ret.getNode())
1958 return Ret;
1959
1960 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1961 // selectcc x, y, a, b, inv(cc)
1962 //
1963 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1964 // selectcc x, y, a, b, cc
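    // For instance, with an integer compare and cc = setlt, the seteq form
    // below yields selectcc x, y, a, b, setge.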
1965 SDValue LHS = N->getOperand(0);
1966 if (LHS.getOpcode() != ISD::SELECT_CC) {
1967 return SDValue();
1968 }
1969
1970 SDValue RHS = N->getOperand(1);
1971 SDValue True = N->getOperand(2);
1972 SDValue False = N->getOperand(3);
1973 ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1974
1975 if (LHS.getOperand(2).getNode() != True.getNode() ||
1976 LHS.getOperand(3).getNode() != False.getNode() ||
1977 RHS.getNode() != False.getNode()) {
1978 return SDValue();
1979 }
1980
1981 switch (NCC) {
1982 default: return SDValue();
1983 case ISD::SETNE: return LHS;
1984 case ISD::SETEQ: {
1985 ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1986 LHSCC = ISD::getSetCCInverse(LHSCC,
1987 LHS.getOperand(0).getValueType().isInteger());
1988 if (DCI.isBeforeLegalizeOps() ||
1989 isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1990 return DAG.getSelectCC(SDLoc(N),
1991 LHS.getOperand(0),
1992 LHS.getOperand(1),
1993 LHS.getOperand(2),
1994 LHS.getOperand(3),
1995 LHSCC);
1996 break;
1997 }
1998 }
1999 return SDValue();
2000 }
2001
2002 case AMDGPUISD::EXPORT: {
2003 SDValue Arg = N->getOperand(1);
2004 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2005 break;
2006
2007 SDValue NewArgs[8] = {
2008 N->getOperand(0), // Chain
2009 SDValue(),
2010 N->getOperand(2), // ArrayBase
2011 N->getOperand(3), // Type
2012 N->getOperand(4), // SWZ_X
2013 N->getOperand(5), // SWZ_Y
2014 N->getOperand(6), // SWZ_Z
2015 N->getOperand(7) // SWZ_W
2016 };
2017 SDLoc DL(N);
2018 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2019 return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2020 }
2021 case AMDGPUISD::TEXTURE_FETCH: {
2022 SDValue Arg = N->getOperand(1);
2023 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2024 break;
2025
2026 SDValue NewArgs[19] = {
2027 N->getOperand(0),
2028 N->getOperand(1),
2029 N->getOperand(2),
2030 N->getOperand(3),
2031 N->getOperand(4),
2032 N->getOperand(5),
2033 N->getOperand(6),
2034 N->getOperand(7),
2035 N->getOperand(8),
2036 N->getOperand(9),
2037 N->getOperand(10),
2038 N->getOperand(11),
2039 N->getOperand(12),
2040 N->getOperand(13),
2041 N->getOperand(14),
2042 N->getOperand(15),
2043 N->getOperand(16),
2044 N->getOperand(17),
2045 N->getOperand(18),
2046 };
2047 SDLoc DL(N);
2048 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2049 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2050 }
2051 }
2052
2053 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2054 }
2055
2056 static bool
2057 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2058 SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
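  // A rough sketch of the folds handled below:
  //   (op (FNEG_R600 x), ...) -> (op x, ...) with the neg modifier set,
  //   (op (FABS_R600 x), ...) -> (op x, ...) with the abs modifier set,
  //   CONST_COPY and MOV_IMM_* sources are folded into the sel / literal
  //   operands when the constant-read and literal limits allow it.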
2059 const R600InstrInfo *TII =
2060 static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2061 if (!Src.isMachineOpcode())
2062 return false;
2063 switch (Src.getMachineOpcode()) {
2064 case AMDGPU::FNEG_R600:
2065 if (!Neg.getNode())
2066 return false;
2067 Src = Src.getOperand(0);
2068 Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2069 return true;
2070 case AMDGPU::FABS_R600:
2071 if (!Abs.getNode())
2072 return false;
2073 Src = Src.getOperand(0);
2074 Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2075 return true;
2076 case AMDGPU::CONST_COPY: {
2077 unsigned Opcode = ParentNode->getMachineOpcode();
2078 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2079
2080 if (!Sel.getNode())
2081 return false;
2082
2083 SDValue CstOffset = Src.getOperand(0);
2084 if (ParentNode->getValueType(0).isVector())
2085 return false;
2086
2087     // Gather constant values
2088 int SrcIndices[] = {
2089 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2090 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2091 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2092 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2093 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2094 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2095 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2096 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2097 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2098 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2099 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2100 };
2101 std::vector<unsigned> Consts;
2102 for (int OtherSrcIdx : SrcIndices) {
2103 int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2104 if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2105 continue;
2106 if (HasDst) {
2107 OtherSrcIdx--;
2108 OtherSelIdx--;
2109 }
2110 if (RegisterSDNode *Reg =
2111 dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2112 if (Reg->getReg() == AMDGPU::ALU_CONST) {
2113 ConstantSDNode *Cst
2114 = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2115 Consts.push_back(Cst->getZExtValue());
2116 }
2117 }
2118 }
2119
2120 ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2121 Consts.push_back(Cst->getZExtValue());
2122 if (!TII->fitsConstReadLimitations(Consts)) {
2123 return false;
2124 }
2125
2126 Sel = CstOffset;
2127 Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2128 return true;
2129 }
2130 case AMDGPU::MOV_IMM_I32:
2131 case AMDGPU::MOV_IMM_F32: {
2132 unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2133 uint64_t ImmValue = 0;
2134
2135
2136 if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2137 ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2138 float FloatValue = FPC->getValueAPF().convertToFloat();
2139 if (FloatValue == 0.0) {
2140 ImmReg = AMDGPU::ZERO;
2141 } else if (FloatValue == 0.5) {
2142 ImmReg = AMDGPU::HALF;
2143 } else if (FloatValue == 1.0) {
2144 ImmReg = AMDGPU::ONE;
2145 } else {
2146 ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2147 }
2148 } else {
2149 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2150 uint64_t Value = C->getZExtValue();
2151 if (Value == 0) {
2152 ImmReg = AMDGPU::ZERO;
2153 } else if (Value == 1) {
2154 ImmReg = AMDGPU::ONE_INT;
2155 } else {
2156 ImmValue = Value;
2157 }
2158 }
2159
2160 // Check that we aren't already using an immediate.
2161 // XXX: It's possible for an instruction to have more than one
2162 // immediate operand, but this is not supported yet.
2163 if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2164 if (!Imm.getNode())
2165 return false;
2166 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2167 assert(C);
2168 if (C->getZExtValue())
2169 return false;
2170 Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2171 }
2172 Src = DAG.getRegister(ImmReg, MVT::i32);
2173 return true;
2174 }
2175 default:
2176 return false;
2177 }
2178 }
2179
2180
2181 /// \brief Fold the instructions after selecting them
2182 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2183 SelectionDAG &DAG) const {
2184 const R600InstrInfo *TII =
2185 static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2186 if (!Node->isMachineOpcode())
2187 return Node;
2188 unsigned Opcode = Node->getMachineOpcode();
2189 SDValue FakeOp;
2190
2191 std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2192
2193 if (Opcode == AMDGPU::DOT_4) {
2194 int OperandIdx[] = {
2195 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2196 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2197 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2198 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2199 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2200 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2201 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2202 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2203 };
2204 int NegIdx[] = {
2205 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2206 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2207 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2208 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2209 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2210 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2211 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2212 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2213 };
2214 int AbsIdx[] = {
2215 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2216 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2217 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2218 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2219 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2220 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2221 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2222 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2223 };
2224 for (unsigned i = 0; i < 8; i++) {
2225 if (OperandIdx[i] < 0)
2226 return Node;
2227 SDValue &Src = Ops[OperandIdx[i] - 1];
2228 SDValue &Neg = Ops[NegIdx[i] - 1];
2229 SDValue &Abs = Ops[AbsIdx[i] - 1];
2230 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2231 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2232 if (HasDst)
2233 SelIdx--;
2234 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2235 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2236 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2237 }
2238 } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2239 for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2240 SDValue &Src = Ops[i];
2241 if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2242 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2243 }
2244 } else if (Opcode == AMDGPU::CLAMP_R600) {
2245 SDValue Src = Node->getOperand(0);
2246 if (!Src.isMachineOpcode() ||
2247 !TII->hasInstrModifiers(Src.getMachineOpcode()))
2248 return Node;
2249 int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2250 AMDGPU::OpName::clamp);
2251 if (ClampIdx < 0)
2252 return Node;
2253 SDLoc DL(Node);
2254 std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2255 Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2256 return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2257 Node->getVTList(), Ops);
2258 } else {
2259 if (!TII->hasInstrModifiers(Opcode))
2260 return Node;
2261 int OperandIdx[] = {
2262 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2263 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2264 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2265 };
2266 int NegIdx[] = {
2267 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2268 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2269 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2270 };
2271 int AbsIdx[] = {
2272 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2273 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2274 -1
2275 };
2276 for (unsigned i = 0; i < 3; i++) {
2277 if (OperandIdx[i] < 0)
2278 return Node;
2279 SDValue &Src = Ops[OperandIdx[i] - 1];
2280 SDValue &Neg = Ops[NegIdx[i] - 1];
2281 SDValue FakeAbs;
2282 SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2283 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2284 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2285 int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2286 if (HasDst) {
2287 SelIdx--;
2288 ImmIdx--;
2289 }
2290 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2291 SDValue &Imm = Ops[ImmIdx];
2292 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2293 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2294 }
2295 }
2296
2297 return Node;
2298 }
2299