//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//
13
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "llvm/CodeGen/MachineFunctionPass.h"
20 #include "llvm/Support/CommandLine.h"
21
22 using namespace llvm;
23
24 #define DEBUG_TYPE "si-pre-emit-peephole"
25
26 namespace {
27
28 class SIPreEmitPeephole : public MachineFunctionPass {
29 private:
30 const SIInstrInfo *TII = nullptr;
31 const SIRegisterInfo *TRI = nullptr;
32
33 bool optimizeVccBranch(MachineInstr &MI) const;
34 bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
35
36 public:
37 static char ID;
38
SIPreEmitPeephole()39 SIPreEmitPeephole() : MachineFunctionPass(ID) {
40 initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
41 }
42
43 bool runOnMachineFunction(MachineFunction &MF) override;
44 };
45
46 } // End anonymous namespace.
47
INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

// Externally visible pass identifier used to schedule this pass.
char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
54
/// Rewrite a S_CBRANCH_VCC[N]Z whose VCC was produced by ANDing EXEC with an
/// all-ones/all-zero mask into a S_CBRANCH_EXEC[N]Z, an unconditional
/// S_BRANCH, or no branch at all. Returns true if the branch was changed.
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  // Pick the wave-size specific condition register and opcodes.
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  // Scan backwards from the branch for the AND/ANDN2 that defines VCC,
  // giving up after a few instructions or if EXEC is clobbered in between.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      // VCC must be wholly defined by the matched AND/ANDN2; any partial
      // write or other opcode defeats the transformation.
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  // Canonicalize the AND so that EXEC is the first source operand.
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  // Only all-ones (-1) or all-zero masks are foldable.
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    // Keep scanning backwards for the move-immediate that defines SReg.
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First if sreg is only used in the AND instruction fold the immediate
    // into the AND and delete the now-dead move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  // If nothing between the AND and the branch read VCC and SCC is dead, the
  // AND itself is removable; keep VCC correct with a MOV when the branch
  // does not kill it (VCC may be live-out).
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
          .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
          .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly: VCCZ means "all lanes off", so the branch is
    // a fall-through (erase) and VCCNZ becomes unconditional.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all succesors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    // The fall-through path is also unreachable once the branch is
    // unconditional.
    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // The rewritten branch no longer reads VCC: drop the use operand and let
  // the new opcode's implicit operands be recomputed.
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
209
/// Given two S_SET_GPR_IDX_ON instructions \p First and \p MI, erase \p MI
/// (and any intervening S_SET_GPR_IDX_OFF) when the two are identical and
/// nothing in between invalidates the index mode. Returns true on success.
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  // src0 may be an immediate; IdxReg stays invalid (0) in that case.
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  // Both SET_GPR_IDX_ON must set the same index and mode.
  if (!MI.isIdenticalTo(First))
    return false;

  // Scan back to find an identical S_SET_GPR_IDX_ON
  for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
       E = MI.getIterator(); I != E; ++I) {
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      // A mode change invalidates the first SET_GPR_IDX_ON.
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      // The OFF becomes redundant if we keep the first ON active.
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      // M0 holds the index while the mode is on; any clobber of M0 or of
      // the index source register defeats the optimization.
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                      [&MRI, this](const MachineOperand &MO) {
                        return MO.isReg() &&
                               TRI->isVectorRegister(MRI, MO.getReg());
                      })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  // The second SET_GPR_IDX_ON and any intervening OFFs are redundant.
  MI.eraseFromParent();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromParent();
  return true;
}
259
/// Run all pre-emission peepholes over \p MF: VCC branch folding on the
/// first terminator of each block, SI_RETURN_TO_EPILOG relocation, and
/// redundant S_SET_GPR_IDX_ON removal. Returns true if the function changed.
bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  // Shared landing block for relocated SI_RETURN_TO_EPILOG, created lazily.
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
    MachineBasicBlock::iterator TermI = MBBE;
    // Check first terminator for VCC branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        // Note: skips the remaining peepholes for this block.
        continue;
      default:
        break;
      }
    }
    // Check all terminators for SI_RETURN_TO_EPILOG
    // FIXME: This is not an optimization and should be moved somewhere else.
    while (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (&MBB != &MF.back() || &MI != &MBB.back()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
          // at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          MI.eraseFromParent();
          // Erasing MI invalidated the iterators; restart from the first
          // terminator.
          MBBE = MBB.getFirstTerminator();
          TermI = MBBE;
          continue;
        }
      }
      TermI++;
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
      MachineInstr &MI = *MBBI;
      // Advance before any potential erasure of MI inside optimizeSetGPR().
      ++MBBI;

      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      // On failure, restart the pairing window from the current instruction.
      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}
347