//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

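// Execmask states tracked per instruction and per block. An instruction may
// require WQM, Exact, or neither (zero means "don't care"); the analysis
// propagates these requirements backwards through the function.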
enum {
  StateWQM = 0x1,
  StateExact = 0x2,
};

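// Per-instruction analysis result. Needs is the state (if any) the
// instruction itself must execute in; OutNeeds is the union of states
// required anywhere downstream of it.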
struct InstrInfo {
  char Needs = 0;
  char OutNeeds = 0;
};

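// Per-block analysis result. InNeeds and OutNeeds are the states required at
// block entry and exit; Needs is the union of states required by the block's
// own instructions.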
struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

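// A worklist entry for the fixed-point propagation: either a single
// instruction or a whole basic block (exactly one of the two is set).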
struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() {}
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<const MachineInstr *, 2> ExecExports;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;

  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Whole Quad Mode";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
                "SI Whole Quad Mode", false, false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");

  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
        Flags = StateWQM;
      } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
        Flags = StateExact;
      } else {
        // Handle export instructions with the exec mask valid flag set.
        if (Opcode == AMDGPU::EXP) {
          if (MI.getOperand(4).getImm() != 0)
            ExecExports.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            unsigned Reg = MO.getReg();

            if (!TRI->isVirtualRegister(Reg) &&
                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      Instructions[&MI].Needs = Flags;
      Worklist.push_back(&MI);
      GlobalFlags |= Flags;
    }

    if (WQMOutputs && MBB.succ_empty()) {
      // This is a prolog shader. Make sure we go back to exact mode at the
      // end.
      Blocks[&MBB].OutNeeds = StateExact;
      Worklist.push_back(&MBB);
      GlobalFlags |= StateExact;
    }
  }

  return GlobalFlags;
}

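// Propagate the flags of a single instruction: backwards to the previous
// instruction in the block, up to the block level, and along the SSA def-use
// chains of its inputs when the instruction itself needs WQM.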
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions that are followed by WQM computations
  // must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level.
  BI.Needs |= II.Needs;
  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
    BI.InNeeds |= II.Needs;
    Worklist.push_back(MBB);
  }

  // Propagate backwards within the block.
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = II.Needs | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate the WQM flag to instruction inputs.
  assert(II.Needs != (StateWQM | StateExact));
  if (II.Needs != StateWQM)
    return;

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    // At this point, physical registers appear as inputs or outputs
    // and following them makes no sense (and would in fact be incorrect
    // when the same VGPR is used as both an output and an input that leads
    // to an instruction that needs WQM).
    //
    // Note: VCC appears e.g. in 64-bit addition with carry. Theoretically we
    // would have to trace this; in practice it only happens for 64-bit
    // computations like pointers, where both dwords are followed already
    // anyway.
    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
      continue;

    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
      InstrInfo &DefII = Instructions[&DefMI];

      // Obviously skip if DefMI is already flagged as needing WQM.
      //
      // The instruction might also be flagged as needing Exact. This happens
      // when the result of an atomic is used in a WQM computation. In this
      // case, the atomic must not run for helper pixels and the WQM result is
      // undefined.
      if (DefII.Needs != 0)
        continue;

      DefII.Needs = StateWQM;
      Worklist.push_back(&DefMI);
    }
  }
}

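// Propagate a block's requirements to its last instruction and to its CFG
// neighbors: predecessors must provide what this block needs on entry, and
// successors must accept what this block exits with.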
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions.
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

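// Seed the worklist via scanInstructions, then iterate instruction and block
// propagation to a fixed point. Returns the union of all states required
// anywhere in the function.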
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

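// Switch to Exact mode at Before by ANDing EXEC with the live mask. If
// SaveWQM is non-zero, the current (WQM) EXEC is saved in that register so
// that WQM can be restored later.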
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  if (SaveWQM) {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
            SaveWQM)
        .addReg(LiveMaskReg);
  } else {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
            AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .addReg(LiveMaskReg);
  }
}

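// Switch to WQM mode at Before, either by restoring a previously saved EXEC
// (SavedWQM) or by recomputing it from the current EXEC with S_WQM_B64.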
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  if (SavedWQM) {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
        .addReg(SavedWQM);
  } else {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC);
  }
}

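// Walk a block and insert the EXEC manipulations computed by the analysis,
// switching between WQM and Exact mode immediately before each instruction
// that requires a state different from the current one.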
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  if (!(BI.InNeeds & StateWQM))
    return;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
    return;

  unsigned SavedWQMReg = 0;
  bool WQMFromExec = isEntry;
  char State = isEntry ? StateExact : StateWQM;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  while (II != IE) {
    MachineInstr &MI = *II;
    ++II;

    // Skip instructions that are not affected by EXEC.
    if (TII->isScalarUnit(MI) && !MI.isTerminator())
      continue;

    // Generic instructions such as COPY will either disappear by register
    // coalescing or be lowered to SALU or VALU instructions.
    if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
      if (MI.getNumExplicitOperands() >= 1) {
        const MachineOperand &Op = MI.getOperand(0);
        if (Op.isReg()) {
          if (TRI->isSGPRReg(*MRI, Op.getReg())) {
            // SGPR instructions are not affected by EXEC.
            continue;
          }
        }
      }
    }

    char Needs = 0;
    char OutNeeds = 0;
    auto InstrInfoIt = Instructions.find(&MI);
    if (InstrInfoIt != Instructions.end()) {
      Needs = InstrInfoIt->second.Needs;
      OutNeeds = InstrInfoIt->second.OutNeeds;

      // Make sure to switch to Exact mode before the end of the block when
      // Exact and only Exact is needed further downstream.
      if (OutNeeds == StateExact && MI.isTerminator()) {
        assert(Needs == 0);
        Needs = StateExact;
      }
    }

    // State switching.
    if (Needs && State != Needs) {
      if (Needs == StateExact) {
        assert(!SavedWQMReg);

        if (!WQMFromExec && (OutNeeds & StateWQM))
          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

        toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
      } else {
        assert(WQMFromExec == (SavedWQMReg == 0));
        toWQM(MBB, &MI, SavedWQMReg);
        SavedWQMReg = 0;
      }

      State = Needs;
    }
  }

  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
    assert(WQMFromExec == (SavedWQMReg == 0));
    toWQM(MBB, MBB.end(), SavedWQMReg);
  } else if (BI.OutNeeds == StateExact && State != StateExact) {
    toExact(MBB, MBB.end(), 0, LiveMaskReg);
  }
}

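// Replace all SI_PS_LIVE pseudo-instructions with copies of the live mask,
// i.e. the EXEC value from function entry, whose set bits correspond to real
// (non-helper) pixels.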
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
        .addReg(LiveMaskReg);
    MI->eraseFromParent();
  }
}

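// Pass entry point: run the analysis on pixel shaders and, when WQM is needed
// anywhere, emit the entry-block prolog and the per-block state switches.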
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  Instructions.clear();
  Blocks.clear();
  ExecExports.clear();
  LiveMaskQueries.clear();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  char GlobalFlags = analyzeFunction(MF);
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    return !LiveMaskQueries.empty();
  }

  // Store a copy of the original live mask when required.
  unsigned LiveMaskReg = 0;
  {
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if ((GlobalFlags & StateExact) || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
          .addReg(AMDGPU::EXEC);
    }

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
              AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC);

      lowerLiveMaskQueries(LiveMaskReg);
      // EntryMI may become invalid here.
      return true;
    }
  }

  lowerLiveMaskQueries(LiveMaskReg);

  // Handle the general case.
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  return true;
}