1 //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ to reduce register bank
11 /// conflicts.
12 ///
13 /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14 /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15 /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16 /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17 ///
18 /// The shader can read one dword from each of these banks once per cycle.
19 /// If an instruction has to read more register operands from the same bank
20 /// an additional cycle is needed. HW attempts to pre-load registers through
21 /// input operand gathering, but a stall cycle may occur if that fails. For
22 /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23 /// potentially incuring 2 stall cycles.
24 ///
25 /// The pass tries to reassign registers to reduce bank conflicts.
26 ///
27 /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28 /// that 4 has to be subtracted from an SGPR bank number to get the real value.
29 /// This also corresponds to bit numbers in bank masks used in the pass.
30 ///
31 //===----------------------------------------------------------------------===//
32
33 #include "AMDGPU.h"
34 #include "AMDGPUSubtarget.h"
35 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
36 #include "SIInstrInfo.h"
37 #include "SIMachineFunctionInfo.h"
38 #include "Utils/AMDGPUBaseInfo.h"
39 #include "llvm/ADT/SmallSet.h"
40 #include "llvm/ADT/Statistic.h"
41 #include "llvm/CodeGen/LiveInterval.h"
42 #include "llvm/CodeGen/LiveIntervals.h"
43 #include "llvm/CodeGen/LiveRegMatrix.h"
44 #include "llvm/CodeGen/MachineFunctionPass.h"
45 #include "llvm/CodeGen/MachineLoopInfo.h"
46 #include "llvm/CodeGen/VirtRegMap.h"
47 #include "llvm/InitializePasses.h"
48 #include "llvm/Support/MathExtras.h"
49
50 using namespace llvm;
51
52 static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
53 cl::desc("Verify stall cycles in the regbanks reassign pass"),
54 cl::value_desc("0|1|2"),
55 cl::init(0), cl::Hidden);
56
57 #define DEBUG_TYPE "amdgpu-regbanks-reassign"
58
59 #define NUM_VGPR_BANKS 4
60 #define NUM_SGPR_BANKS 8
61 #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
62 #define SGPR_BANK_OFFSET NUM_VGPR_BANKS
63 #define VGPR_BANK_MASK 0xf
64 #define SGPR_BANK_MASK 0xff0
65 #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
66
67 STATISTIC(NumStallsDetected,
68 "Number of operand read stalls detected");
69 STATISTIC(NumStallsRecovered,
70 "Number of operand read stalls recovered");
71
72 namespace {
73
74 class GCNRegBankReassign : public MachineFunctionPass {
75
76 class OperandMask {
77 public:
OperandMask(unsigned r,unsigned s,unsigned m)78 OperandMask(unsigned r, unsigned s, unsigned m)
79 : Reg(r), SubReg(s), Mask(m) {}
80 Register Reg;
81 unsigned SubReg;
82 unsigned Mask;
83 };
84
85 class Candidate {
86 public:
Candidate(MachineInstr * mi,Register reg,unsigned subreg,unsigned freebanks)87 Candidate(MachineInstr *mi, Register reg, unsigned subreg,
88 unsigned freebanks)
89 : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
90
91 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump(const GCNRegBankReassign * P) const92 void dump(const GCNRegBankReassign *P) const {
93 MI->dump();
94 dbgs() << P->printReg(Reg) << " to banks ";
95 dumpFreeBanks(FreeBanks);
96 dbgs() << '\n';
97 }
98 #endif
99
100 MachineInstr *MI;
101 Register Reg;
102 unsigned SubReg;
103 unsigned FreeBanks;
104 };
105
106 class CandidateList : public std::map<unsigned, std::list<Candidate>> {
107 public:
push(unsigned Weight,const Candidate && C)108 void push(unsigned Weight, const Candidate&& C) {
109 operator[](Weight).push_front(C);
110 }
111
back()112 Candidate &back() {
113 return rbegin()->second.back();
114 }
115
pop_back()116 void pop_back() {
117 rbegin()->second.pop_back();
118 if (rbegin()->second.empty())
119 erase(rbegin()->first);
120 }
121
122 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump(const GCNRegBankReassign * P) const123 void dump(const GCNRegBankReassign *P) const {
124 dbgs() << "\nCandidates:\n\n";
125 for (auto &B : *this) {
126 dbgs() << " Weight " << B.first << ":\n";
127 for (auto &C : B.second)
128 C.dump(P);
129 }
130 dbgs() << "\n\n";
131 }
132 #endif
133 };
134
135 public:
136 static char ID;
137
138 public:
GCNRegBankReassign()139 GCNRegBankReassign() : MachineFunctionPass(ID) {
140 initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
141 }
142
143 bool runOnMachineFunction(MachineFunction &MF) override;
144
getPassName() const145 StringRef getPassName() const override { return "GCN RegBank Reassign"; }
146
getAnalysisUsage(AnalysisUsage & AU) const147 void getAnalysisUsage(AnalysisUsage &AU) const override {
148 AU.addRequired<MachineLoopInfo>();
149 AU.addRequired<LiveIntervals>();
150 AU.addRequired<VirtRegMap>();
151 AU.addRequired<LiveRegMatrix>();
152 AU.setPreservesAll();
153 MachineFunctionPass::getAnalysisUsage(AU);
154 }
155
156 private:
157 const GCNSubtarget *ST;
158
159 const MachineRegisterInfo *MRI;
160
161 const SIRegisterInfo *TRI;
162
163 MachineLoopInfo *MLI;
164
165 VirtRegMap *VRM;
166
167 LiveRegMatrix *LRM;
168
169 LiveIntervals *LIS;
170
171 unsigned MaxNumVGPRs;
172
173 unsigned MaxNumSGPRs;
174
175 BitVector RegsUsed;
176
177 SmallVector<OperandMask, 8> OperandMasks;
178
179 CandidateList Candidates;
180
181 const MCPhysReg *CSRegs;
182
183 // Returns bank for a phys reg.
184 unsigned getPhysRegBank(Register Reg, unsigned SubReg) const;
185
186 // Return a bit set for each register bank used. 4 banks for VGPRs and
187 // 8 banks for SGPRs.
188 // Registers already processed and recorded in RegsUsed are excluded.
189 // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
190 uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank);
191
192 // Analyze one instruction returning the number of stalls and a mask of the
193 // banks used by all operands.
194 // If Reg and Bank are provided, assume all uses of Reg will be replaced with
195 // a register chosen from Bank.
196 std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
197 Register Reg = Register(),
198 unsigned SubReg = 0, int Bank = -1);
199
200 // Return true if register is regular VGPR or SGPR or their tuples.
201 // Returns false for special registers like m0, vcc etc.
202 bool isReassignable(Register Reg) const;
203
204 // Check if registers' defs are old and may be pre-loaded.
205 // Returns 0 if both registers are old enough, 1 or 2 if one or both
206 // registers will not likely be pre-loaded.
207 unsigned getOperandGatherWeight(const MachineInstr& MI,
208 Register Reg1,
209 Register Reg2,
210 unsigned StallCycles) const;
211
212
213 // Find all bank bits in UsedBanks where Mask can be relocated to.
214 unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
215
216 // Find all bank bits in UsedBanks where Mask can be relocated to.
217 // Bank is relative to the register and not its subregister component.
218 // Returns 0 is a register is not reassignable.
219 unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask,
220 unsigned UsedBanks) const;
221
222 // Add cadidate instruction to the work list.
223 void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
224 unsigned StallCycles);
225
226 // Collect cadidate instructions across function. Returns a number stall
227 // cycles detected. Only counts stalls if Collect is false.
228 unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
229
230 // Remove all candidates that read specified register.
231 void removeCandidates(Register Reg);
232
233 // Compute stalls within the uses of SrcReg replaced by a register from
234 // Bank. If Bank is -1 does not perform substitution. If Collect is set
235 // candidates are collected and added to work list.
236 unsigned computeStallCycles(Register SrcReg,
237 Register Reg = Register(),
238 unsigned SubReg = 0, int Bank = -1,
239 bool Collect = false);
240
241 // Search for a register in Bank unused within LI.
242 // Returns phys reg or NoRegister.
243 MCRegister scavengeReg(LiveInterval &LI, unsigned Bank,
244 unsigned SubReg) const;
245
246 // Try to reassign candidate. Returns number or stall cycles saved.
247 unsigned tryReassign(Candidate &C);
248
249 bool verifyCycles(MachineFunction &MF,
250 unsigned OriginalCycles, unsigned CyclesSaved);
251
252
253 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
254 public:
printReg(Register Reg,unsigned SubReg=0) const255 Printable printReg(Register Reg, unsigned SubReg = 0) const {
256 return Printable([Reg, SubReg, this](raw_ostream &OS) {
257 if (Reg.isPhysical()) {
258 OS << llvm::printReg(Reg, TRI);
259 return;
260 }
261 if (!VRM->isAssignedReg(Reg))
262 OS << "<unassigned> " << llvm::printReg(Reg, TRI);
263 else
264 OS << llvm::printReg(Reg, TRI) << '('
265 << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
266 if (SubReg)
267 OS << ':' << TRI->getSubRegIndexName(SubReg);
268 });
269 }
270
printBank(unsigned Bank)271 static Printable printBank(unsigned Bank) {
272 return Printable([Bank](raw_ostream &OS) {
273 OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
274 });
275 }
276
dumpFreeBanks(unsigned FreeBanks)277 static void dumpFreeBanks(unsigned FreeBanks) {
278 for (unsigned L = 0; L < NUM_BANKS; ++L)
279 if (FreeBanks & (1 << L))
280 dbgs() << printBank(L) << ' ';
281 }
282 #endif
283 };
284
285 } // End anonymous namespace.
286
287 INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
288 false, false)
289 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
290 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
291 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
292 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
293 INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
294 false, false)
295
296
297 char GCNRegBankReassign::ID = 0;
298
299 char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
300
getPhysRegBank(Register Reg,unsigned SubReg) const301 unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
302 unsigned SubReg) const {
303 assert(Reg.isPhysical());
304
305 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
306 unsigned Size = TRI->getRegSizeInBits(*RC);
307 if (Size == 16)
308 Reg = TRI->get32BitRegister(Reg);
309 else if (Size > 32) {
310 if (SubReg) {
311 const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
312 Reg = TRI->getSubReg(Reg, SubReg);
313 if (TRI->getRegSizeInBits(*SubRC) > 32)
314 Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
315 } else {
316 Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
317 }
318 }
319
320 if (TRI->hasVGPRs(RC)) {
321 unsigned RegNo = Reg - AMDGPU::VGPR0;
322 return RegNo % NUM_VGPR_BANKS;
323 }
324
325 unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
326 return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
327 }
328
getRegBankMask(Register Reg,unsigned SubReg,int Bank)329 uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
330 int Bank) {
331 if (Reg.isVirtual()) {
332 if (!VRM->isAssignedReg(Reg))
333 return 0;
334
335 Reg = VRM->getPhys(Reg);
336 if (!Reg)
337 return 0;
338 if (SubReg)
339 Reg = TRI->getSubReg(Reg, SubReg);
340 }
341
342 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
343 unsigned Size = TRI->getRegSizeInBits(*RC);
344
345 if (Size == 16) {
346 Reg = TRI->get32BitRegister(Reg);
347 Size = 1;
348 } else {
349 Size /= 32;
350 if (Size > 1)
351 Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
352 }
353
354 if (TRI->hasVGPRs(RC)) {
355 // VGPRs have 4 banks assigned in a round-robin fashion.
356 unsigned RegNo = Reg - AMDGPU::VGPR0;
357 uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
358 unsigned Used = 0;
359 // Bitmask lacks an extract method
360 for (unsigned I = 0; I < Size; ++I)
361 if (RegsUsed.test(RegNo + I))
362 Used |= 1 << I;
363 RegsUsed.set(RegNo, RegNo + Size);
364 Mask &= ~Used;
365 Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank);
366 return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
367 }
368
369 // SGPRs have 8 banks holding 2 consequitive registers each.
370 unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
371 unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
372 if (RegNo + StartBit >= RegsUsed.size())
373 return 0;
374
375 if (Size > 1)
376 Size /= 2;
377 unsigned Mask = (1 << Size) - 1;
378 unsigned Used = 0;
379 for (unsigned I = 0; I < Size; ++I)
380 if (RegsUsed.test(StartBit + RegNo + I))
381 Used |= 1 << I;
382 RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size);
383 Mask &= ~Used;
384 Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS
385 : unsigned(Bank - SGPR_BANK_OFFSET);
386 Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
387 // Reserve 4 bank ids for VGPRs.
388 return Mask << SGPR_BANK_OFFSET;
389 }
390
391 std::pair<unsigned, unsigned>
analyzeInst(const MachineInstr & MI,Register Reg,unsigned SubReg,int Bank)392 GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg,
393 unsigned SubReg, int Bank) {
394 unsigned StallCycles = 0;
395 unsigned UsedBanks = 0;
396
397 if (MI.isDebugValue())
398 return std::make_pair(StallCycles, UsedBanks);
399
400 RegsUsed.reset();
401 OperandMasks.clear();
402 for (const auto& Op : MI.explicit_uses()) {
403 // Undef can be assigned to any register, so two vregs can be assigned
404 // the same phys reg within the same instruction.
405 if (!Op.isReg() || Op.isUndef())
406 continue;
407
408 const Register R = Op.getReg();
409 const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
410
411 // Do not compute stalls for AGPRs
412 if (TRI->hasAGPRs(RC))
413 continue;
414
415 // Do not compute stalls if sub-register covers all banks
416 if (Op.getSubReg()) {
417 LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
418 if (TRI->hasVGPRs(RC)) {
419 if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
420 continue;
421 } else {
422 if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
423 continue;
424 }
425 }
426
427 unsigned ShiftedBank = Bank;
428
429 if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
430 unsigned RegOffset =
431 TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
432 unsigned Offset = TRI->getChannelFromSubReg(
433 Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
434 if (Bank < NUM_VGPR_BANKS) {
435 unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
436 ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
437 } else if (Bank >= SGPR_BANK_OFFSET) {
438 unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
439 ShiftedBank = SGPR_BANK_OFFSET +
440 (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
441 }
442 }
443
444 uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
445 (Reg == R) ? ShiftedBank : -1);
446 StallCycles += countPopulation(UsedBanks & Mask);
447 UsedBanks |= Mask;
448 OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
449 }
450
451 return std::make_pair(StallCycles, UsedBanks);
452 }
453
getOperandGatherWeight(const MachineInstr & MI,Register Reg1,Register Reg2,unsigned StallCycles) const454 unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
455 Register Reg1,
456 Register Reg2,
457 unsigned StallCycles) const
458 {
459 unsigned Defs = 0;
460 MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
461 MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
462 for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
463 if (MI.isDebugInstr())
464 continue;
465 --Def;
466 if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
467 continue;
468 if (Def->modifiesRegister(Reg1, TRI))
469 Defs |= 1;
470 if (Def->modifiesRegister(Reg2, TRI))
471 Defs |= 2;
472 }
473 return countPopulation(Defs);
474 }
475
isReassignable(Register Reg) const476 bool GCNRegBankReassign::isReassignable(Register Reg) const {
477 if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
478 return false;
479
480 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
481
482 Register PhysReg = VRM->getPhys(Reg);
483
484 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
485 return false;
486
487 for (auto U : MRI->use_nodbg_operands(Reg)) {
488 if (U.isImplicit())
489 return false;
490 const MachineInstr *UseInst = U.getParent();
491 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
492 return false;
493 }
494
495 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
496 unsigned Size = TRI->getRegSizeInBits(*RC);
497
498 // TODO: Support 16 bit registers. Those needs to be moved with their
499 // parent VGPR_32 and potentially a sibling 16 bit sub-register.
500 if (Size < 32)
501 return false;
502
503 if (TRI->hasVGPRs(RC))
504 return true;
505
506 if (Size == 16)
507 return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
508
509 if (Size > 32)
510 PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
511
512 return AMDGPU::SGPR_32RegClass.contains(PhysReg);
513 }
514
getFreeBanks(unsigned Mask,unsigned UsedBanks) const515 unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
516 unsigned UsedBanks) const {
517 unsigned Size = countPopulation(Mask);
518 unsigned FreeBanks = 0;
519 unsigned Bank = findFirstSet(Mask);
520
521 UsedBanks &= ~Mask;
522
523 // Find free VGPR banks
524 if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
525 for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
526 if (Bank == I)
527 continue;
528 unsigned NewMask = ((1 << Size) - 1) << I;
529 NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
530 if (!(UsedBanks & NewMask))
531 FreeBanks |= 1 << I;
532 }
533 return FreeBanks;
534 }
535
536 // Find free SGPR banks
537 // SGPR tuples must be aligned, so step is size in banks it
538 // crosses.
539 Bank -= SGPR_BANK_OFFSET;
540 for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
541 if (Bank == I)
542 continue;
543 unsigned NewMask = ((1 << Size) - 1) << I;
544 NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
545 if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
546 FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
547 }
548
549 return FreeBanks;
550 }
551
getFreeBanks(Register Reg,unsigned SubReg,unsigned Mask,unsigned UsedBanks) const552 unsigned GCNRegBankReassign::getFreeBanks(Register Reg,
553 unsigned SubReg,
554 unsigned Mask,
555 unsigned UsedBanks) const {
556 if (!isReassignable(Reg))
557 return 0;
558
559 unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
560
561 unsigned Offset = TRI->getChannelFromSubReg(SubReg);
562 if (Offset && (Mask & VGPR_BANK_MASK)) {
563 unsigned Shift = Offset;
564 if (Shift >= NUM_VGPR_BANKS)
565 return 0;
566 unsigned VB = FreeBanks & VGPR_BANK_MASK;
567 FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
568 VGPR_BANK_MASK;
569 } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
570 unsigned Shift = Offset >> 1;
571 if (Shift >= NUM_SGPR_BANKS)
572 return 0;
573 unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
574 FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
575 SGPR_BANK_SHIFTED_MASK;
576 FreeBanks <<= SGPR_BANK_OFFSET;
577 }
578
579 LLVM_DEBUG(if (FreeBanks) {
580 dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
581 << " to banks: "; dumpFreeBanks(FreeBanks);
582 dbgs() << '\n'; });
583
584 return FreeBanks;
585 }
586
collectCandidates(MachineInstr & MI,unsigned UsedBanks,unsigned StallCycles)587 void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
588 unsigned UsedBanks,
589 unsigned StallCycles) {
590 LLVM_DEBUG(MI.dump());
591
592 if (!StallCycles)
593 return;
594
595 LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
596
597 for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
598 for (unsigned J = I + 1; J != E; ++J) {
599 if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
600 continue;
601
602 Register Reg1 = OperandMasks[I].Reg;
603 Register Reg2 = OperandMasks[J].Reg;
604 unsigned SubReg1 = OperandMasks[I].SubReg;
605 unsigned SubReg2 = OperandMasks[J].SubReg;
606 unsigned Mask1 = OperandMasks[I].Mask;
607 unsigned Mask2 = OperandMasks[J].Mask;
608 unsigned Size1 = countPopulation(Mask1);
609 unsigned Size2 = countPopulation(Mask2);
610
611 LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
612 " and " << printReg(Reg2, SubReg2) << '\n');
613
614 unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
615 Weight += MLI->getLoopDepth(MI.getParent()) * 10;
616
617 LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
618
619 unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
620 unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
621 if (FreeBanks1)
622 Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
623 Candidate(&MI, Reg1, SubReg1, FreeBanks1));
624 if (FreeBanks2)
625 Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
626 Candidate(&MI, Reg2, SubReg2, FreeBanks2));
627 }
628 }
629 }
630
computeStallCycles(Register SrcReg,Register Reg,unsigned SubReg,int Bank,bool Collect)631 unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg,
632 unsigned SubReg, int Bank,
633 bool Collect) {
634 unsigned TotalStallCycles = 0;
635 SmallSet<const MachineInstr *, 16> Visited;
636
637 for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
638 if (MI.isBundle())
639 continue;
640 if (!Visited.insert(&MI).second)
641 continue;
642 unsigned StallCycles;
643 unsigned UsedBanks;
644 std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
645 TotalStallCycles += StallCycles;
646 if (Collect)
647 collectCandidates(MI, UsedBanks, StallCycles);
648 }
649
650 return TotalStallCycles;
651 }
652
scavengeReg(LiveInterval & LI,unsigned Bank,unsigned SubReg) const653 MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
654 unsigned SubReg) const {
655 const TargetRegisterClass *RC = MRI->getRegClass(LI.reg());
656 unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
657 : MaxNumSGPRs;
658 unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
659 : AMDGPU::SGPR0);
660
661 for (MCRegister Reg : RC->getRegisters()) {
662 // Check occupancy limit.
663 if (TRI->isSubRegisterEq(Reg, MaxReg))
664 break;
665
666 if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
667 continue;
668
669 for (unsigned I = 0; CSRegs[I]; ++I)
670 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
671 !LRM->isPhysRegUsed(CSRegs[I]))
672 return MCRegister::from(AMDGPU::NoRegister);
673
674 LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
675
676 if (!LRM->checkInterference(LI, Reg))
677 return Reg;
678 }
679
680 return MCRegister::from(AMDGPU::NoRegister);
681 }
682
tryReassign(Candidate & C)683 unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
684 if (!LIS->hasInterval(C.Reg))
685 return 0;
686
687 LiveInterval &LI = LIS->getInterval(C.Reg);
688 LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
689 LI.dump());
690
691 // For each candidate bank walk all instructions in the range of live
692 // interval and check if replacing the register with one belonging to
693 // the candidate bank reduces conflicts.
694
695 unsigned OrigStalls = computeStallCycles(C.Reg);
696 LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
697 if (!OrigStalls)
698 return 0;
699
700 struct BankStall {
701 BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
702 bool operator<(const BankStall &RHS) const {
703 if (Stalls == RHS.Stalls)
704 return Bank < RHS.Bank;
705 return Stalls > RHS.Stalls;
706 }
707 unsigned Bank;
708 unsigned Stalls;
709 };
710 SmallVector<BankStall, 8> BankStalls;
711
712 for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
713 if (C.FreeBanks & (1 << Bank)) {
714 LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
715 unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
716 if (Stalls < OrigStalls) {
717 LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
718 << Stalls << '\n');
719 BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
720 }
721 }
722 }
723 llvm::sort(BankStalls);
724
725 MCRegister OrigReg = VRM->getPhys(C.Reg);
726 LRM->unassign(LI);
727 while (!BankStalls.empty()) {
728 BankStall BS = BankStalls.pop_back_val();
729 MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg);
730 if (Reg == AMDGPU::NoRegister) {
731 LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
732 << '\n');
733 continue;
734 }
735 LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
736 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
737 << " in bank " << printBank(BS.Bank) << '\n');
738
739 LRM->assign(LI, Reg);
740
741 LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
742
743 return OrigStalls - BS.Stalls;
744 }
745 LRM->assign(LI, OrigReg);
746
747 return 0;
748 }
749
collectCandidates(MachineFunction & MF,bool Collect)750 unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
751 bool Collect) {
752 unsigned TotalStallCycles = 0;
753
754 for (MachineBasicBlock &MBB : MF) {
755
756 LLVM_DEBUG(if (Collect) {
757 if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
758 else dbgs() << MBB.getName(); dbgs() << ":\n";
759 });
760
761 for (MachineInstr &MI : MBB.instrs()) {
762 if (MI.isBundle())
763 continue; // we analyze the instructions inside the bundle individually
764
765 unsigned StallCycles;
766 unsigned UsedBanks;
767 std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
768
769 if (Collect)
770 collectCandidates(MI, UsedBanks, StallCycles);
771
772 TotalStallCycles += StallCycles;
773 }
774
775 LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
776 }
777
778 return TotalStallCycles;
779 }
780
removeCandidates(Register Reg)781 void GCNRegBankReassign::removeCandidates(Register Reg) {
782 typename CandidateList::iterator Next;
783 for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
784 Next = std::next(I);
785 I->second.remove_if([Reg, this](const Candidate& C) {
786 return C.MI->readsRegister(Reg, TRI);
787 });
788 if (I->second.empty())
789 Candidates.erase(I);
790 }
791 }
792
verifyCycles(MachineFunction & MF,unsigned OriginalCycles,unsigned CyclesSaved)793 bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
794 unsigned OriginalCycles,
795 unsigned CyclesSaved) {
796 unsigned StallCycles = collectCandidates(MF, false);
797 LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
798 << " stall cycles left\n");
799 return StallCycles + CyclesSaved == OriginalCycles;
800 }
801
runOnMachineFunction(MachineFunction & MF)802 bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
803 ST = &MF.getSubtarget<GCNSubtarget>();
804 if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
805 return false;
806
807 MRI = &MF.getRegInfo();
808 TRI = ST->getRegisterInfo();
809 MLI = &getAnalysis<MachineLoopInfo>();
810 VRM = &getAnalysis<VirtRegMap>();
811 LRM = &getAnalysis<LiveRegMatrix>();
812 LIS = &getAnalysis<LiveIntervals>();
813
814 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
815 unsigned Occupancy = MFI->getOccupancy();
816 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
817 MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
818 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
819 MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
820
821 CSRegs = MRI->getCalleeSavedRegs();
822 unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
823 // Not a tight bound
824 AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
825 RegsUsed.resize(NumRegBanks);
826
827 LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
828 << '\n');
829
830 unsigned StallCycles = collectCandidates(MF);
831 NumStallsDetected += StallCycles;
832
833 LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
834 "function " << MF.getName() << '\n');
835
836 LLVM_DEBUG(Candidates.dump(this));
837
838 unsigned CyclesSaved = 0;
839 while (!Candidates.empty()) {
840 Candidate C = Candidates.back();
841 unsigned LocalCyclesSaved = tryReassign(C);
842 CyclesSaved += LocalCyclesSaved;
843
844 if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
845 report_fatal_error("RegBank reassign stall cycles verification failed.");
846
847 Candidates.pop_back();
848 if (LocalCyclesSaved) {
849 removeCandidates(C.Reg);
850 computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
851
852 LLVM_DEBUG(Candidates.dump(this));
853 }
854 }
855 NumStallsRecovered += CyclesSaved;
856
857 LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
858 << " cycles saved in function " << MF.getName() << '\n');
859
860 Candidates.clear();
861
862 if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
863 report_fatal_error("RegBank reassign stall cycles verification failed.");
864
865 RegsUsed.clear();
866
867 return CyclesSaved > 0;
868 }
869