//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold-legacy",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;
  MachineDominatorTree *MDT = nullptr;

  MachineBasicBlock *EarlyExitBlock = nullptr;
  bool EarlyExitClearsExec = false;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool dominatesAllReachable(MachineBasicBlock &MBB);
  void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                  DebugLoc DL);

  bool kill(MachineInstr &MI);

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

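/// Check whether \p MI emits no real ISA instructions and therefore should
/// not count toward the skip threshold.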
static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
  if (MI.isMetaInstruction())
    return true;

  // Handle target specific opcodes.
  switch (MI.getOpcode()) {
  case AMDGPU::SI_MASK_BRANCH:
    return true;
  default:
    return false;
  }
}

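/// Estimate whether the blocks from \p From (inclusive) to \p To (exclusive)
/// are cheap enough to simply execute with EXEC = 0, or whether they should
/// (or, for correctness, must) be jumped over when all lanes are inactive.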
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(*I))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it become infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

/// Check whether \p MBB dominates all blocks that are reachable from it.
bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
  for (MachineBasicBlock *Other : depth_first(&MBB)) {
    if (!MDT->dominates(&MBB, Other))
      return false;
  }
  return true;
}

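/// Emit the "null export; s_endpgm" sequence that terminates a pixel shader
/// wave, inserted before \p I.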
static void generatePsEndPgm(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, DebugLoc DL,
                             const SIInstrInfo *TII) {
  // Generate "null export; s_endpgm".
  BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
      .addImm(AMDGPU::Exp::ET_NULL)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addImm(1)  // vm
      .addImm(0)  // compr
      .addImm(0); // en
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}

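/// Lazily create the single early-exit block shared by the whole function,
/// and on the first request with \p ClearExec set, prepend a move that zeros
/// EXEC before the null export.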
void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
                                         bool ClearExec) {
  MachineFunction *MF = MBB.getParent();
  DebugLoc DL;

  if (!EarlyExitBlock) {
    EarlyExitBlock = MF->CreateMachineBasicBlock();
    MF->insert(MF->end(), EarlyExitBlock);
    generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
    EarlyExitClearsExec = false;
  }

  if (ClearExec && !EarlyExitClearsExec) {
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    auto ExitI = EarlyExitBlock->getFirstNonPHI();
    assert(ExitI->getOpcode() == AMDGPU::EXP_DONE);
    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
    EarlyExitClearsExec = true;
  }
}

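/// Split \p MBB after \p MI and incrementally update the dominator tree: the
/// new block takes over \p MBB's old successors, and \p MBB's only successor
/// becomes the new block.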
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
                       MachineDominatorTree *MDT) {
  MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);

  // Update dominator tree
  using DomTreeT = DomTreeBase<MachineBasicBlock>;
  SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
  for (MachineBasicBlock *Succ : SplitBB->successors()) {
    DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
  }
  DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
  MDT->getBase().applyUpdates(DTUpdates);
}

/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
/// iterator. Only applies to pixel shaders.
void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, DebugLoc DL) {
  MachineFunction *MF = MBB.getParent();
  (void)MF;
  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);

  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
  // basic block that has no further successors (e.g., there was an
  // `unreachable` there in IR). This can happen with original source of the
  // form:
  //
  //   if (uniform_condition) {
  //     write_to_memory();
  //     discard;
  //   }
  //
  // In this case, we write the "null_export; s_endpgm" skip code in the
  // already-existing basic block.
  auto NextBBI = std::next(MBB.getIterator());
  bool NoSuccessor =
      I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);

  if (NoSuccessor) {
    generatePsEndPgm(MBB, I, DL, TII);
  } else {
    ensureEarlyExitBlock(MBB, false);

    MachineInstr *BranchMI =
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
            .addMBB(EarlyExitBlock);

    // Split the block if the branch will not come at the end.
    auto Next = std::next(BranchMI->getIterator());
    if (Next != MBB.end() && !Next->isTerminator())
      splitBlock(MBB, *BranchMI, MDT);

    MBB.addSuccessor(EarlyExitBlock);
    MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
  }
}

/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
/// Return true unless the terminator is a no-op.
bool SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    if (ST.hasNoSdstCMPX())
      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);

    assert(MI.getOperand(0).isReg());

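    // The e32 (VOP2) form of V_CMPX takes the immediate as src0 but requires
    // its second source to be a VGPR, so it can only be used when the
    // condition register is a VGPR; otherwise fall back to the e64 form with
    // explicit source modifiers.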
    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
      if (!ST.hasNoSdstCMPX())
        I.addReg(AMDGPU::VCC, RegState::Define);

      I.addImm(0)  // src0 modifiers
        .add(MI.getOperand(1))
        .addImm(0)  // src1 modifiers
        .add(MI.getOperand(0));

      I.addImm(0);  // omod
    }
    return true;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal) {
        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                     : AMDGPU::S_MOV_B64), Exec)
          .addImm(0);
        return true;
      }
      return false;
    }

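    // Otherwise clear the EXEC bits of the lanes being killed, i.e. those
    // where Op equals KillVal: ANDN2 for KillVal == -1 (kill where Op is
    // set), AND for KillVal == 0 (kill where Op is clear).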
    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    if (ST.isWave32())
      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
        .addReg(Exec)
        .add(Op);
    return true;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

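  // Measure from SrcMBB's first successor (the start of the code that would
  // be skipped) to the branch destination; only insert the branch if running
  // that region with EXEC = 0 would be costly or incorrect.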
  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  SkipThreshold = SkipThresholdFlag;

  SmallVector<MachineInstr *, 4> KillInstrs;
  bool MadeChange = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR: {
        MadeChange = true;
        bool CanKill = kill(MI);

        // Check if we can add an early "if exec=0 { end shader }".
        //
        // Note that we _always_ do this if it is correct, even if the kill
        // happens fairly late in the shader, because the null export should
        // generally still be cheaper than normal export(s).
        //
        // TODO: The dominatesAllReachable check is conservative: if the
        //       dominance is only missing due to _uniform_ branches, we could
        //       in fact insert the early-exit as well.
        if (CanKill &&
            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
            dominatesAllReachable(MBB)) {
          // Mark the instruction for kill-if-dead insertion. We delay this
          // change because it modifies the CFG.
          KillInstrs.push_back(&MI);
        } else {
          MI.eraseFromParent();
        }
        break;
      }

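      // SI_KILL_CLEANUP marks a spot after a kill where the early-exit check
      // may be placed; handle it like the kill terminators above.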
      case AMDGPU::SI_KILL_CLEANUP:
        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
            dominatesAllReachable(MBB)) {
          KillInstrs.push_back(&MI);
        } else {
          MI.eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

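  // Now insert the deferred kill-if-dead sequences; this was delayed until
  // after the scan because skipIfDead modifies the CFG.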
  for (MachineInstr *Kill : KillInstrs) {
    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
               Kill->getDebugLoc());
    Kill->eraseFromParent();
  }
  KillInstrs.clear();
  EarlyExitBlock = nullptr;

  return MadeChange;
}