//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations just before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

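// Late peephole pass, run once the instruction stream is final: it rewrites
// vcc branches whose condition is really a function of exec, and removes
// redundant back-to-back S_SET_GPR_IDX_ON sequences.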
class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // This pattern sometimes appears after basic block placement, when a block
  // that assigns -1 or 0 to a saved mask is combined with a block that
  // consumes the saved mask and then branches.
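  //
  // For example (wave64, assuming vcc and scc have no other uses):
  //   s_mov_b64 s[0:1], -1
  //   s_and_b64 vcc, exec, s[0:1]
  //   s_cbranch_vccnz .LBB0_1
  // becomes
  //   s_cbranch_execnz .LBB0_1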
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

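  // Walk backwards from the branch looking for the S_AND/S_ANDN2 that defines
  // vcc. Give up if exec is clobbered on the way, if vcc is modified by some
  // other instruction, or after a few instructions (for compile time).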
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

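  // Determine the constant value of the second AND operand: either the
  // immediate itself, or the immediate that an earlier s_mov assigns to sreg.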
  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If sreg is only used in the AND instruction, fold the immediate
    // into the AND and drop the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

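  // Once the branch is rewritten below, the AND itself is no longer needed:
  // delete it if its scc result is dead and vcc was not read between the AND
  // and the branch. If vcc stays live past the branch, keep its value by
  // substituting an equivalent s_mov.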
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

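  // Rewrite the branch according to the mask value that is now known at
  // compile time.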
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by the new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (MachineInstr *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

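  // The rewritten branch no longer reads vcc; drop the stale use operand and
  // recompute the implicit operands for the new opcode.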
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

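// If a second S_SET_GPR_IDX_ON identical to an earlier one is found, with no
// mode switch and no clobber of m0 or the index register in between, the
// second one and any intervening S_SET_GPR_IDX_OFF are redundant and can be
// removed, merging the two indexed regions. Schematically (the v_mov here
// stand for indirect moves reading m0):
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)
//   v_mov_b32_e32 v1, v5
//   s_set_gpr_idx_off
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)
//   v_mov_b32_e32 v2, v6
//   s_set_gpr_idx_off
// becomes
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)
//   v_mov_b32_e32 v1, v5
//   v_mov_b32_e32 v2, v6
//   s_set_gpr_idx_off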
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to make
  // sure removing the second one (and any S_SET_GPR_IDX_OFF on the way) is
  // safe.
  for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
       E = MI.getIterator(); I != E; ++I) {
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  MI.eraseFromParent();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromParent();
  return true;
}


bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
    MachineBasicBlock::iterator TermI = MBBE;
    // Check first terminator for VCC branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        continue;
      default:
        break;
      }
    }
    // Check all terminators for SI_RETURN_TO_EPILOG
    // FIXME: This is not an optimization and should be moved somewhere else.
    while (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (&MBB != &MF.back() || &MI != &MBB.back()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
          // at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
          MI.eraseFromParent();
          MBBE = MBB.getFirstTerminator();
          TermI = MBBE;
          continue;
        }
      }
      ++TermI;
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do the expensive checks in optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
      MachineInstr &MI = *MBBI;
      ++MBBI;

      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}