1 //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// This pass performs exec mask handling peephole optimizations which need
/// to be done before register allocation to reduce register pressure.
12 ///
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIInstrInfo.h"
19 #include "llvm/CodeGen/LiveIntervals.h"
20 #include "llvm/CodeGen/MachineFunctionPass.h"
21 #include "llvm/InitializePasses.h"
22
23 using namespace llvm;
24
25 #define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
26
27 namespace {
28
class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  // Wave-size dependent opcodes and registers; initialized once per function
  // in runOnMachineFunction (B32/LO variants for wave32, B64 otherwise).
  unsigned AndOpc;
  unsigned Andn2Opc;
  unsigned OrSaveExecOpc;
  unsigned XorTermrOpc;
  MCRegister CondReg; // VCC_LO (wave32) or VCC (wave64)
  MCRegister ExecReg; // EXEC_LO (wave32) or EXEC (wave64)

  // Fold a v_cndmask/v_cmp/s_and sequence feeding a VCC branch into a single
  // s_andn2; returns the condition register on success, Register() otherwise.
  Register optimizeVcndVcmpPair(MachineBasicBlock &MBB);
  // Remove a redundant s_and of exec in an else block; returns true if MBB
  // was changed.
  bool optimizeElseBranch(MachineBasicBlock &MBB);

public:
  static char ID;

  SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations pre-RA";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
65
66 } // End anonymous namespace.
67
INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                      "SI optimize exec mask operations pre-RA", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                    "SI optimize exec mask operations pre-RA", false, false)

// The address of ID (not its value) uniquely identifies this pass.
char SIOptimizeExecMaskingPreRA::ID = 0;

char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;

// Factory function to create the pass.
FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
  return new SIOptimizeExecMaskingPreRA();
}
81
82 // See if there is a def between \p AndIdx and \p SelIdx that needs to live
83 // beyond \p AndIdx.
isDefBetween(const LiveRange & LR,SlotIndex AndIdx,SlotIndex SelIdx)84 static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
85 SlotIndex SelIdx) {
86 LiveQueryResult AndLRQ = LR.Query(AndIdx);
87 return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
88 }
89
90 // FIXME: Why do we bother trying to handle physical registers here?
isDefBetween(const SIRegisterInfo & TRI,LiveIntervals * LIS,Register Reg,const MachineInstr & Sel,const MachineInstr & And)91 static bool isDefBetween(const SIRegisterInfo &TRI,
92 LiveIntervals *LIS, Register Reg,
93 const MachineInstr &Sel, const MachineInstr &And) {
94 SlotIndex AndIdx = LIS->getInstructionIndex(And);
95 SlotIndex SelIdx = LIS->getInstructionIndex(Sel);
96
97 if (Reg.isVirtual())
98 return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
99
100 for (MCRegUnitIterator UI(Reg.asMCReg(), &TRI); UI.isValid(); ++UI) {
101 if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx))
102 return true;
103 }
104
105 return false;
106 }
107
// Optimize sequence
//    %sel = V_CNDMASK_B32_e64 0, 1, %cc
//    %cmp = V_CMP_NE_U32 1, %sel
//    $vcc = S_AND_B64 $exec, %cmp
//    S_CBRANCH_VCC[N]Z
// =>
//    $vcc = S_ANDN2_B64 $exec, %cc
//    S_CBRANCH_VCC[N]Z
//
// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
// only 3 first instructions are really needed. S_AND_B64 with exec is a
// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
// lanes.
//
// Returns %cc register on success.
Register
SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
  // Anchor the search on a VCC-conditional branch terminator.
  auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
                           unsigned Opc = MI.getOpcode();
                           return Opc == AMDGPU::S_CBRANCH_VCCZ ||
                                  Opc == AMDGPU::S_CBRANCH_VCCNZ; });
  if (I == MBB.terminators().end())
    return Register();

  // The branch reads VCC (CondReg); find the s_and that defines it. Both of
  // its operands must be registers.
  auto *And =
      TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS);
  if (!And || And->getOpcode() != AndOpc ||
      !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
    return Register();

  // The and must combine exec with the compare result; exec may be either
  // operand, so pick the non-exec one as the compare register.
  MachineOperand *AndCC = &And->getOperand(1);
  Register CmpReg = AndCC->getReg();
  unsigned CmpSubReg = AndCC->getSubReg();
  if (CmpReg == Register(ExecReg)) {
    AndCC = &And->getOperand(2);
    CmpReg = AndCC->getReg();
    CmpSubReg = AndCC->getSubReg();
  } else if (And->getOperand(2).getReg() != Register(ExecReg)) {
    return Register();
  }

  // The compare must be a v_cmp_ne_u32 in the same block as the and.
  auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS);
  if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
                Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
      Cmp->getParent() != And->getParent())
    return Register();

  // One source must be the select result, the other the immediate 1
  // (commutable, so canonicalize the register into Op1).
  MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
  MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
  if (Op1->isImm() && Op2->isReg())
    std::swap(Op1, Op2);
  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
    return Register();

  // The compared value must come from V_CNDMASK_B32_e64 0, 1, %cc.
  Register SelReg = Op1->getReg();
  auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
    return Register();

  // Source modifiers would change the selected values; bail out.
  if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
      TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
    return Register();

  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
      Op1->getImm() != 0 || Op2->getImm() != 1)
    return Register();

  Register CCReg = CC->getReg();

  // If there was a def between the select and the and, we would need to move it
  // to fold this.
  if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
    return Register();

  LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
                    << *And);

  // Replace the s_and with s_andn2 exec, %cc in place, keeping LiveIntervals
  // in sync: unmap the old and before inserting, map the new one after.
  LIS->RemoveMachineInstrFromMaps(*And);
  MachineInstr *Andn2 =
      BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
              And->getOperand(0).getReg())
          .addReg(ExecReg)
          .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
  // Preserve the dead flag of the implicit SCC def (operand 3 on both).
  MachineOperand &AndSCC = And->getOperand(3);
  assert(AndSCC.getReg() == AMDGPU::SCC);
  MachineOperand &Andn2SCC = Andn2->getOperand(3);
  assert(Andn2SCC.getReg() == AMDGPU::SCC);
  Andn2SCC.setIsDead(AndSCC.isDead());
  And->eraseFromParent();
  LIS->InsertMachineInstrInMaps(*Andn2);

  LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');

  // Try to remove the compare. Its value must not be read between the cmp and
  // the new s_andn2 if it is VCC, or must simply be unused for any other
  // register.
  if ((CmpReg.isVirtual() && MRI->use_nodbg_empty(CmpReg)) ||
      (CmpReg == Register(CondReg) &&
       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
                    [&](const MachineInstr &MI) {
                      return MI.readsRegister(CondReg, TRI);
                    }))) {
    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');

    LIS->RemoveMachineInstrFromMaps(*Cmp);
    Cmp->eraseFromParent();

    // Try to remove v_cndmask_b32. Only possible once its sole user (the
    // compare) is gone.
    if (SelReg.isVirtual() && MRI->use_nodbg_empty(SelReg)) {
      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');

      LIS->RemoveMachineInstrFromMaps(*Sel);
      Sel->eraseFromParent();
    }
  }

  return CCReg;
}
229
// Optimize sequence
//    %dst = S_OR_SAVEEXEC %src
//    ... instructions not modifying exec ...
//    %tmp = S_AND $exec, %dst
//    $exec = S_XOR_term $exec, %tmp
// =>
//    %dst = S_OR_SAVEEXEC %src
//    ... instructions not modifying exec ...
//    $exec = S_XOR_term $exec, %dst
//
// Clean up potentially unnecessary code added for safety during
// control flow lowering.
//
// Return whether any changes were made to MBB.
bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
  if (MBB.empty())
    return false;

  // Check this is an else block: it must start with S_OR_SAVEEXEC.
  auto First = MBB.begin();
  MachineInstr &SaveExecMI = *First;
  if (SaveExecMI.getOpcode() != OrSaveExecOpc)
    return false;

  // Find the terminating S_XOR_term that restores exec.
  auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) {
    return MI.getOpcode() == XorTermrOpc;
  });
  if (I == MBB.terminators().end())
    return false;

  MachineInstr &XorTermMI = *I;
  if (XorTermMI.getOperand(1).getReg() != Register(ExecReg))
    return false;

  Register SavedExecReg = SaveExecMI.getOperand(0).getReg();
  Register DstReg = XorTermMI.getOperand(2).getReg();

  // Find potentially unnecessary S_AND: scan backwards from the xor for
  // '%dst = S_AND $exec, ...'.
  MachineInstr *AndExecMI = nullptr;
  I--;
  while (I != First && !AndExecMI) {
    if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg &&
        I->getOperand(1).getReg() == Register(ExecReg))
      AndExecMI = &*I;
    I--;
  }
  if (!AndExecMI)
    return false;

  // Check for exec modifying instructions.
  // Note: exec defs do not create live ranges beyond the
  // instruction so isDefBetween cannot be used.
  // Instead just check that the def segments are adjacent.
  SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
  SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
  for (MCRegUnitIterator UI(ExecReg, TRI); UI.isValid(); ++UI) {
    LiveRange &RegUnit = LIS->getRegUnit(*UI);
    if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx)))
      return false;
  }

  // Remove unnecessary S_AND: retarget the saveexec to define DstReg
  // directly, dropping stale intervals first and recomputing after.
  LIS->removeInterval(SavedExecReg);
  LIS->removeInterval(DstReg);

  SaveExecMI.getOperand(0).setReg(DstReg);

  LIS->RemoveMachineInstrFromMaps(*AndExecMI);
  AndExecMI->eraseFromParent();

  LIS->createAndComputeVirtRegInterval(DstReg);

  return true;
}
304
bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  // Pick the wave-size specific opcodes and condition/exec registers.
  const bool Wave32 = ST.isWave32();
  AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  OrSaveExecOpc =
      Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  CondReg = MCRegister::from(Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC);
  ExecReg = MCRegister::from(Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);

  // Registers whose live ranges must be recomputed at the end if anything
  // changed.
  DenseSet<Register> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {

    if (optimizeElseBranch(MBB)) {
      RecalcRegs.insert(AMDGPU::SCC);
      Changed = true;
    }

    if (Register Reg = optimizeVcndVcmpPair(MBB)) {
      RecalcRegs.insert(Reg);
      RecalcRegs.insert(AMDGPU::VCC_LO);
      RecalcRegs.insert(AMDGPU::VCC_HI);
      RecalcRegs.insert(AMDGPU::SCC);
      Changed = true;
    }

    // Try to remove unneeded instructions before s_endpgm.
    if (MBB.succ_empty()) {
      if (MBB.empty())
        continue;

      // Skip this if the endpgm has any implicit uses, otherwise we would need
      // to be careful to update / remove them.
      // S_ENDPGM always has a single imm operand that is not used other than to
      // end up in the encoding
      MachineInstr &Term = MBB.back();
      if (Term.getOpcode() != AMDGPU::S_ENDPGM || Term.getNumOperands() != 1)
        continue;

      // Worklist of blocks to clean, walking single-successor predecessors
      // of the exit block.
      SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});

      while (!Blocks.empty()) {
        auto CurBB = Blocks.pop_back_val();
        auto I = CurBB->rbegin(), E = CurBB->rend();
        if (I != E) {
          // Step over an unconditional branch or the endpgm itself; any
          // other branch terminator blocks the cleanup for this block.
          if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
            ++I;
          else if (I->isBranch())
            continue;
        }

        // Delete instructions backwards until one with an observable effect
        // is found.
        while (I != E) {
          if (I->isDebugInstr()) {
            I = std::next(I);
            continue;
          }

          if (I->mayStore() || I->isBarrier() || I->isCall() ||
              I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
            break;

          LLVM_DEBUG(dbgs()
                     << "Removing no effect instruction: " << *I << '\n');

          for (auto &Op : I->operands()) {
            if (Op.isReg())
              RecalcRegs.insert(Op.getReg());
          }

          // Advance before erasing to keep the reverse iterator valid.
          auto Next = std::next(I);
          LIS->RemoveMachineInstrFromMaps(*I);
          I->eraseFromParent();
          I = Next;

          Changed = true;
        }

        if (I != E)
          continue;

        // Try to ascend predecessors.
        for (auto *Pred : CurBB->predecessors()) {
          if (Pred->succ_size() == 1)
            Blocks.push_back(Pred);
        }
      }
      continue;
    }

    // If the only user of a logical operation is move to exec, fold it now
    // to prevent forming of saveexec. I.e:
    //
    //    %0:sreg_64 = COPY $exec
    //    %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
    // =>
    //    %1 = S_AND_B64 $exec, %2:sreg_64
    unsigned ScanThreshold = 10;
    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E
         && ScanThreshold--; ++I) {
      // Continue scanning if this is not a full exec copy
      if (!(I->isFullCopy() && I->getOperand(1).getReg() == Register(ExecReg)))
        continue;

      // Fold only if the copy's single non-debug user is in the same block.
      Register SavedExec = I->getOperand(0).getReg();
      if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec) &&
          MRI->use_instr_nodbg_begin(SavedExec)->getParent() ==
          I->getParent()) {
        LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
        LIS->RemoveMachineInstrFromMaps(*I);
        I->eraseFromParent();
        MRI->replaceRegWith(SavedExec, ExecReg);
        LIS->removeInterval(SavedExec);
        Changed = true;
      }
      break;
    }
  }

  // Refresh liveness for everything touched: recompute virtual register
  // intervals that still have uses, drop physical register units.
  if (Changed) {
    for (auto Reg : RecalcRegs) {
      if (Reg.isVirtual()) {
        LIS->removeInterval(Reg);
        if (!MRI->reg_empty(Reg))
          LIS->createAndComputeVirtRegInterval(Reg);
      } else {
        LIS->removeAllRegUnitsForPhysReg(Reg);
      }
    }
  }

  return Changed;
}
448