1 //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ to reduce register bank
11 /// conflicts.
12 ///
13 /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14 /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15 /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16 /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17 ///
18 /// The shader can read one dword from each of these banks once per cycle.
19 /// If an instruction has to read more register operands from the same bank
20 /// an additional cycle is needed. HW attempts to pre-load registers through
21 /// input operand gathering, but a stall cycle may occur if that fails. For
22 /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23 /// potentially incuring 2 stall cycles.
24 ///
25 /// The pass tries to reassign registers to reduce bank conflicts.
26 ///
27 /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28 /// that 4 has to be subtracted from an SGPR bank number to get the real value.
29 /// This also corresponds to bit numbers in bank masks used in the pass.
30 ///
31 //===----------------------------------------------------------------------===//
32 
33 #include "AMDGPU.h"
34 #include "AMDGPUSubtarget.h"
35 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
36 #include "SIInstrInfo.h"
37 #include "SIMachineFunctionInfo.h"
38 #include "Utils/AMDGPUBaseInfo.h"
39 #include "llvm/ADT/SmallSet.h"
40 #include "llvm/ADT/Statistic.h"
41 #include "llvm/CodeGen/LiveInterval.h"
42 #include "llvm/CodeGen/LiveIntervals.h"
43 #include "llvm/CodeGen/LiveRegMatrix.h"
44 #include "llvm/CodeGen/MachineFunctionPass.h"
45 #include "llvm/CodeGen/MachineLoopInfo.h"
46 #include "llvm/CodeGen/VirtRegMap.h"
47 #include "llvm/InitializePasses.h"
48 #include "llvm/Support/MathExtras.h"
49 
50 using namespace llvm;
51 
52 static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
53   cl::desc("Verify stall cycles in the regbanks reassign pass"),
54   cl::value_desc("0|1|2"),
55   cl::init(0), cl::Hidden);
56 
57 #define DEBUG_TYPE "amdgpu-regbanks-reassign"
58 
59 #define NUM_VGPR_BANKS 4
60 #define NUM_SGPR_BANKS 8
61 #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
62 #define SGPR_BANK_OFFSET NUM_VGPR_BANKS
63 #define VGPR_BANK_MASK 0xf
64 #define SGPR_BANK_MASK 0xff0
65 #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
66 
67 STATISTIC(NumStallsDetected,
68           "Number of operand read stalls detected");
69 STATISTIC(NumStallsRecovered,
70           "Number of operand read stalls recovered");
71 
72 namespace {
73 
74 class GCNRegBankReassign : public MachineFunctionPass {
75 
76   class OperandMask {
77   public:
OperandMask(unsigned r,unsigned s,unsigned m)78     OperandMask(unsigned r, unsigned s, unsigned m)
79       : Reg(r), SubReg(s), Mask(m) {}
80     Register Reg;
81     unsigned SubReg;
82     unsigned Mask;
83   };
84 
85   class Candidate {
86   public:
Candidate(MachineInstr * mi,Register reg,unsigned subreg,unsigned freebanks)87     Candidate(MachineInstr *mi, Register reg, unsigned subreg,
88               unsigned freebanks)
89         : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
90 
91 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump(const GCNRegBankReassign * P) const92     void dump(const GCNRegBankReassign *P) const {
93       MI->dump();
94       dbgs() << P->printReg(Reg) << " to banks ";
95       dumpFreeBanks(FreeBanks);
96       dbgs() << '\n';
97     }
98 #endif
99 
100     MachineInstr *MI;
101     Register Reg;
102     unsigned SubReg;
103     unsigned FreeBanks;
104   };
105 
106   class CandidateList : public std::map<unsigned, std::list<Candidate>> {
107   public:
push(unsigned Weight,const Candidate && C)108     void push(unsigned Weight, const Candidate&& C) {
109       operator[](Weight).push_front(C);
110     }
111 
back()112     Candidate &back() {
113       return rbegin()->second.back();
114     }
115 
pop_back()116     void pop_back() {
117       rbegin()->second.pop_back();
118       if (rbegin()->second.empty())
119         erase(rbegin()->first);
120     }
121 
122 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump(const GCNRegBankReassign * P) const123     void dump(const GCNRegBankReassign *P) const {
124       dbgs() << "\nCandidates:\n\n";
125       for (auto &B : *this) {
126         dbgs() << " Weight " << B.first << ":\n";
127         for (auto &C : B.second)
128           C.dump(P);
129       }
130       dbgs() << "\n\n";
131     }
132 #endif
133   };
134 
135 public:
136   static char ID;
137 
138 public:
GCNRegBankReassign()139   GCNRegBankReassign() : MachineFunctionPass(ID) {
140     initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
141   }
142 
143   bool runOnMachineFunction(MachineFunction &MF) override;
144 
getPassName() const145   StringRef getPassName() const override { return "GCN RegBank Reassign"; }
146 
getAnalysisUsage(AnalysisUsage & AU) const147   void getAnalysisUsage(AnalysisUsage &AU) const override {
148     AU.addRequired<MachineLoopInfo>();
149     AU.addRequired<LiveIntervals>();
150     AU.addRequired<VirtRegMap>();
151     AU.addRequired<LiveRegMatrix>();
152     AU.setPreservesAll();
153     MachineFunctionPass::getAnalysisUsage(AU);
154   }
155 
156 private:
157   const GCNSubtarget *ST;
158 
159   const MachineRegisterInfo *MRI;
160 
161   const SIRegisterInfo *TRI;
162 
163   MachineLoopInfo *MLI;
164 
165   VirtRegMap *VRM;
166 
167   LiveRegMatrix *LRM;
168 
169   LiveIntervals *LIS;
170 
171   unsigned MaxNumVGPRs;
172 
173   unsigned MaxNumSGPRs;
174 
175   BitVector RegsUsed;
176 
177   SmallVector<OperandMask, 8> OperandMasks;
178 
179   CandidateList Candidates;
180 
181   const MCPhysReg *CSRegs;
182 
183   // Returns bank for a phys reg.
184   unsigned getPhysRegBank(Register Reg, unsigned SubReg) const;
185 
186   // Return a bit set for each register bank used. 4 banks for VGPRs and
187   // 8 banks for SGPRs.
188   // Registers already processed and recorded in RegsUsed are excluded.
189   // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
190   uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank);
191 
192   // Analyze one instruction returning the number of stalls and a mask of the
193   // banks used by all operands.
194   // If Reg and Bank are provided, assume all uses of Reg will be replaced with
195   // a register chosen from Bank.
196   std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
197                                             Register Reg = Register(),
198                                             unsigned SubReg = 0, int Bank = -1);
199 
200   // Return true if register is regular VGPR or SGPR or their tuples.
201   // Returns false for special registers like m0, vcc etc.
202   bool isReassignable(Register Reg) const;
203 
204   // Check if registers' defs are old and may be pre-loaded.
205   // Returns 0 if both registers are old enough, 1 or 2 if one or both
206   // registers will not likely be pre-loaded.
207   unsigned getOperandGatherWeight(const MachineInstr& MI,
208                                   Register Reg1,
209                                   Register Reg2,
210                                   unsigned StallCycles) const;
211 
212 
213   // Find all bank bits in UsedBanks where Mask can be relocated to.
214   unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
215 
216   // Find all bank bits in UsedBanks where Mask can be relocated to.
217   // Bank is relative to the register and not its subregister component.
218   // Returns 0 is a register is not reassignable.
219   unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask,
220                         unsigned UsedBanks) const;
221 
222   // Add cadidate instruction to the work list.
223   void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
224                          unsigned StallCycles);
225 
226   // Collect cadidate instructions across function. Returns a number stall
227   // cycles detected. Only counts stalls if Collect is false.
228   unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
229 
230   // Remove all candidates that read specified register.
231   void removeCandidates(Register Reg);
232 
233   // Compute stalls within the uses of SrcReg replaced by a register from
234   // Bank. If Bank is -1 does not perform substitution. If Collect is set
235   // candidates are collected and added to work list.
236   unsigned computeStallCycles(Register SrcReg,
237                               Register Reg = Register(),
238                               unsigned SubReg = 0, int Bank = -1,
239                               bool Collect = false);
240 
241   // Search for a register in Bank unused within LI.
242   // Returns phys reg or NoRegister.
243   MCRegister scavengeReg(LiveInterval &LI, unsigned Bank,
244                          unsigned SubReg) const;
245 
246   // Try to reassign candidate. Returns number or stall cycles saved.
247   unsigned tryReassign(Candidate &C);
248 
249   bool verifyCycles(MachineFunction &MF,
250                     unsigned OriginalCycles, unsigned CyclesSaved);
251 
252 
253 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
254 public:
printReg(Register Reg,unsigned SubReg=0) const255   Printable printReg(Register Reg, unsigned SubReg = 0) const {
256     return Printable([Reg, SubReg, this](raw_ostream &OS) {
257       if (Reg.isPhysical()) {
258         OS << llvm::printReg(Reg, TRI);
259         return;
260       }
261       if (!VRM->isAssignedReg(Reg))
262         OS << "<unassigned> " << llvm::printReg(Reg, TRI);
263       else
264         OS << llvm::printReg(Reg, TRI) << '('
265            << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
266       if (SubReg)
267         OS << ':' << TRI->getSubRegIndexName(SubReg);
268     });
269   }
270 
printBank(unsigned Bank)271   static Printable printBank(unsigned Bank) {
272     return Printable([Bank](raw_ostream &OS) {
273       OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
274     });
275   }
276 
dumpFreeBanks(unsigned FreeBanks)277   static void dumpFreeBanks(unsigned FreeBanks) {
278     for (unsigned L = 0; L < NUM_BANKS; ++L)
279       if (FreeBanks & (1 << L))
280         dbgs() << printBank(L) << ' ';
281   }
282 #endif
283 };
284 
285 } // End anonymous namespace.
286 
287 INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
288                       false, false)
289 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
290 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
291 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
292 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
293 INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
294                     false, false)
295 
296 
297 char GCNRegBankReassign::ID = 0;
298 
299 char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
300 
getPhysRegBank(Register Reg,unsigned SubReg) const301 unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
302                                             unsigned SubReg) const {
303   assert(Reg.isPhysical());
304 
305   const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
306   unsigned Size = TRI->getRegSizeInBits(*RC);
307   if (Size == 16)
308     Reg = TRI->get32BitRegister(Reg);
309   else if (Size > 32) {
310     if (SubReg) {
311       const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
312       Reg = TRI->getSubReg(Reg, SubReg);
313       if (TRI->getRegSizeInBits(*SubRC) > 32)
314         Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
315     } else {
316       Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
317     }
318   }
319 
320   if (TRI->hasVGPRs(RC)) {
321     unsigned RegNo = Reg - AMDGPU::VGPR0;
322     return RegNo % NUM_VGPR_BANKS;
323   }
324 
325   unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
326   return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
327 }
328 
getRegBankMask(Register Reg,unsigned SubReg,int Bank)329 uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
330                                             int Bank) {
331   if (Reg.isVirtual()) {
332     if (!VRM->isAssignedReg(Reg))
333       return 0;
334 
335     Reg = VRM->getPhys(Reg);
336     if (!Reg)
337       return 0;
338     if (SubReg)
339       Reg = TRI->getSubReg(Reg, SubReg);
340   }
341 
342   const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
343   unsigned Size = TRI->getRegSizeInBits(*RC);
344 
345   if (Size == 16) {
346     Reg = TRI->get32BitRegister(Reg);
347     Size = 1;
348   } else {
349     Size /= 32;
350     if (Size > 1)
351       Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
352   }
353 
354   if (TRI->hasVGPRs(RC)) {
355     // VGPRs have 4 banks assigned in a round-robin fashion.
356     unsigned RegNo = Reg - AMDGPU::VGPR0;
357     uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
358     unsigned Used = 0;
359     // Bitmask lacks an extract method
360     for (unsigned I = 0; I < Size; ++I)
361       if (RegsUsed.test(RegNo + I))
362         Used |= 1 << I;
363     RegsUsed.set(RegNo, RegNo + Size);
364     Mask &= ~Used;
365     Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank);
366     return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
367   }
368 
369   // SGPRs have 8 banks holding 2 consequitive registers each.
370   unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
371   unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
372   if (RegNo + StartBit >= RegsUsed.size())
373     return 0;
374 
375   if (Size > 1)
376     Size /= 2;
377   unsigned Mask = (1 << Size) - 1;
378   unsigned Used = 0;
379   for (unsigned I = 0; I < Size; ++I)
380     if (RegsUsed.test(StartBit + RegNo + I))
381       Used |= 1 << I;
382   RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size);
383   Mask &= ~Used;
384   Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS
385                         : unsigned(Bank - SGPR_BANK_OFFSET);
386   Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
387   // Reserve 4 bank ids for VGPRs.
388   return Mask << SGPR_BANK_OFFSET;
389 }
390 
391 std::pair<unsigned, unsigned>
analyzeInst(const MachineInstr & MI,Register Reg,unsigned SubReg,int Bank)392 GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg,
393                                 unsigned SubReg, int Bank) {
394   unsigned StallCycles = 0;
395   unsigned UsedBanks = 0;
396 
397   if (MI.isDebugValue())
398     return std::make_pair(StallCycles, UsedBanks);
399 
400   RegsUsed.reset();
401   OperandMasks.clear();
402   for (const auto& Op : MI.explicit_uses()) {
403     // Undef can be assigned to any register, so two vregs can be assigned
404     // the same phys reg within the same instruction.
405     if (!Op.isReg() || Op.isUndef())
406       continue;
407 
408     const Register R = Op.getReg();
409     const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
410 
411     // Do not compute stalls for AGPRs
412     if (TRI->hasAGPRs(RC))
413       continue;
414 
415     // Do not compute stalls if sub-register covers all banks
416     if (Op.getSubReg()) {
417       LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
418       if (TRI->hasVGPRs(RC)) {
419         if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
420           continue;
421       } else {
422         if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
423           continue;
424       }
425     }
426 
427     unsigned ShiftedBank = Bank;
428 
429     if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
430       unsigned RegOffset =
431           TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
432       unsigned Offset = TRI->getChannelFromSubReg(
433           Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
434       if (Bank < NUM_VGPR_BANKS) {
435         unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
436         ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
437       } else if (Bank >= SGPR_BANK_OFFSET) {
438         unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
439         ShiftedBank = SGPR_BANK_OFFSET +
440                       (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
441       }
442     }
443 
444     uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
445                                    (Reg == R) ? ShiftedBank : -1);
446     StallCycles += countPopulation(UsedBanks & Mask);
447     UsedBanks |= Mask;
448     OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
449   }
450 
451   return std::make_pair(StallCycles, UsedBanks);
452 }
453 
getOperandGatherWeight(const MachineInstr & MI,Register Reg1,Register Reg2,unsigned StallCycles) const454 unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
455                                                     Register Reg1,
456                                                     Register Reg2,
457                                                     unsigned StallCycles) const
458 {
459   unsigned Defs = 0;
460   MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
461   MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
462   for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
463     if (MI.isDebugInstr())
464       continue;
465     --Def;
466     if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
467       continue;
468     if (Def->modifiesRegister(Reg1, TRI))
469       Defs |= 1;
470     if (Def->modifiesRegister(Reg2, TRI))
471       Defs |= 2;
472   }
473   return countPopulation(Defs);
474 }
475 
isReassignable(Register Reg) const476 bool GCNRegBankReassign::isReassignable(Register Reg) const {
477   if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
478     return false;
479 
480   const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
481 
482   Register PhysReg = VRM->getPhys(Reg);
483 
484   if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
485     return false;
486 
487   for (auto U : MRI->use_nodbg_operands(Reg)) {
488     if (U.isImplicit())
489       return false;
490     const MachineInstr *UseInst = U.getParent();
491     if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
492       return false;
493   }
494 
495   const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
496   unsigned Size = TRI->getRegSizeInBits(*RC);
497 
498   // TODO: Support 16 bit registers. Those needs to be moved with their
499   //       parent VGPR_32 and potentially a sibling 16 bit sub-register.
500   if (Size < 32)
501     return false;
502 
503   if (TRI->hasVGPRs(RC))
504     return true;
505 
506   if (Size == 16)
507     return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
508 
509   if (Size > 32)
510     PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
511 
512   return AMDGPU::SGPR_32RegClass.contains(PhysReg);
513 }
514 
getFreeBanks(unsigned Mask,unsigned UsedBanks) const515 unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
516                                           unsigned UsedBanks) const {
517   unsigned Size = countPopulation(Mask);
518   unsigned FreeBanks = 0;
519   unsigned Bank = findFirstSet(Mask);
520 
521   UsedBanks &= ~Mask;
522 
523   // Find free VGPR banks
524   if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
525     for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
526       if (Bank == I)
527         continue;
528       unsigned NewMask = ((1 << Size) - 1) << I;
529       NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
530       if (!(UsedBanks & NewMask))
531         FreeBanks |= 1 << I;
532     }
533     return FreeBanks;
534   }
535 
536   // Find free SGPR banks
537   // SGPR tuples must be aligned, so step is size in banks it
538   // crosses.
539   Bank -= SGPR_BANK_OFFSET;
540   for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
541     if (Bank == I)
542       continue;
543     unsigned NewMask = ((1 << Size) - 1) << I;
544     NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
545     if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
546       FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
547   }
548 
549   return FreeBanks;
550 }
551 
getFreeBanks(Register Reg,unsigned SubReg,unsigned Mask,unsigned UsedBanks) const552 unsigned GCNRegBankReassign::getFreeBanks(Register Reg,
553                                           unsigned SubReg,
554                                           unsigned Mask,
555                                           unsigned UsedBanks) const {
556   if (!isReassignable(Reg))
557     return 0;
558 
559   unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
560 
561   unsigned Offset = TRI->getChannelFromSubReg(SubReg);
562   if (Offset && (Mask & VGPR_BANK_MASK)) {
563     unsigned Shift = Offset;
564     if (Shift >= NUM_VGPR_BANKS)
565       return 0;
566     unsigned VB = FreeBanks & VGPR_BANK_MASK;
567     FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
568                 VGPR_BANK_MASK;
569   } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
570     unsigned Shift = Offset >> 1;
571     if (Shift >= NUM_SGPR_BANKS)
572       return 0;
573     unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
574     FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
575                 SGPR_BANK_SHIFTED_MASK;
576     FreeBanks <<= SGPR_BANK_OFFSET;
577   }
578 
579   LLVM_DEBUG(if (FreeBanks) {
580           dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
581                  << " to banks: "; dumpFreeBanks(FreeBanks);
582           dbgs() << '\n'; });
583 
584   return FreeBanks;
585 }
586 
collectCandidates(MachineInstr & MI,unsigned UsedBanks,unsigned StallCycles)587 void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
588                                            unsigned UsedBanks,
589                                            unsigned StallCycles) {
590   LLVM_DEBUG(MI.dump());
591 
592   if (!StallCycles)
593     return;
594 
595   LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
596 
597   for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
598     for (unsigned J = I + 1; J != E; ++J) {
599       if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
600         continue;
601 
602       Register Reg1 = OperandMasks[I].Reg;
603       Register Reg2 = OperandMasks[J].Reg;
604       unsigned SubReg1 = OperandMasks[I].SubReg;
605       unsigned SubReg2 = OperandMasks[J].SubReg;
606       unsigned Mask1 = OperandMasks[I].Mask;
607       unsigned Mask2 = OperandMasks[J].Mask;
608       unsigned Size1 = countPopulation(Mask1);
609       unsigned Size2 = countPopulation(Mask2);
610 
611       LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
612                       " and " << printReg(Reg2, SubReg2) << '\n');
613 
614       unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
615       Weight += MLI->getLoopDepth(MI.getParent()) * 10;
616 
617       LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
618 
619       unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
620       unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
621       if (FreeBanks1)
622         Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
623                         Candidate(&MI, Reg1, SubReg1, FreeBanks1));
624       if (FreeBanks2)
625         Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
626                         Candidate(&MI, Reg2, SubReg2, FreeBanks2));
627     }
628   }
629 }
630 
computeStallCycles(Register SrcReg,Register Reg,unsigned SubReg,int Bank,bool Collect)631 unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg,
632                                                 unsigned SubReg, int Bank,
633                                                 bool Collect) {
634   unsigned TotalStallCycles = 0;
635   SmallSet<const MachineInstr *, 16> Visited;
636 
637   for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
638     if (MI.isBundle())
639       continue;
640     if (!Visited.insert(&MI).second)
641       continue;
642     unsigned StallCycles;
643     unsigned UsedBanks;
644     std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
645     TotalStallCycles += StallCycles;
646     if (Collect)
647       collectCandidates(MI, UsedBanks, StallCycles);
648   }
649 
650   return TotalStallCycles;
651 }
652 
scavengeReg(LiveInterval & LI,unsigned Bank,unsigned SubReg) const653 MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
654                                            unsigned SubReg) const {
655   const TargetRegisterClass *RC = MRI->getRegClass(LI.reg());
656   unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
657                                                 : MaxNumSGPRs;
658   unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
659                                                         : AMDGPU::SGPR0);
660 
661   for (MCRegister Reg : RC->getRegisters()) {
662     // Check occupancy limit.
663     if (TRI->isSubRegisterEq(Reg, MaxReg))
664       break;
665 
666     if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
667       continue;
668 
669     for (unsigned I = 0; CSRegs[I]; ++I)
670       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
671           !LRM->isPhysRegUsed(CSRegs[I]))
672         return MCRegister::from(AMDGPU::NoRegister);
673 
674     LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
675 
676     if (!LRM->checkInterference(LI, Reg))
677       return Reg;
678   }
679 
680   return MCRegister::from(AMDGPU::NoRegister);
681 }
682 
tryReassign(Candidate & C)683 unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
684   if (!LIS->hasInterval(C.Reg))
685     return 0;
686 
687   LiveInterval &LI = LIS->getInterval(C.Reg);
688   LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
689              LI.dump());
690 
691   // For each candidate bank walk all instructions in the range of live
692   // interval and check if replacing the register with one belonging to
693   // the candidate bank reduces conflicts.
694 
695   unsigned OrigStalls = computeStallCycles(C.Reg);
696   LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
697   if (!OrigStalls)
698     return 0;
699 
700   struct BankStall {
701     BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
702     bool operator<(const BankStall &RHS) const {
703       if (Stalls == RHS.Stalls)
704         return Bank < RHS.Bank;
705       return Stalls > RHS.Stalls;
706     }
707     unsigned Bank;
708     unsigned Stalls;
709   };
710   SmallVector<BankStall, 8> BankStalls;
711 
712   for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
713     if (C.FreeBanks & (1 << Bank)) {
714       LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
715       unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
716       if (Stalls < OrigStalls) {
717         LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
718                      << Stalls << '\n');
719         BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
720       }
721     }
722   }
723   llvm::sort(BankStalls);
724 
725   MCRegister OrigReg = VRM->getPhys(C.Reg);
726   LRM->unassign(LI);
727   while (!BankStalls.empty()) {
728     BankStall BS = BankStalls.pop_back_val();
729     MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg);
730     if (Reg == AMDGPU::NoRegister) {
731       LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
732                    << '\n');
733       continue;
734     }
735     LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
736                  << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
737                  << " in bank " << printBank(BS.Bank) << '\n');
738 
739     LRM->assign(LI, Reg);
740 
741     LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
742 
743     return OrigStalls - BS.Stalls;
744   }
745   LRM->assign(LI, OrigReg);
746 
747   return 0;
748 }
749 
collectCandidates(MachineFunction & MF,bool Collect)750 unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
751                                                bool Collect) {
752   unsigned TotalStallCycles = 0;
753 
754   for (MachineBasicBlock &MBB : MF) {
755 
756     LLVM_DEBUG(if (Collect) {
757             if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
758             else dbgs() << MBB.getName(); dbgs() << ":\n";
759           });
760 
761     for (MachineInstr &MI : MBB.instrs()) {
762       if (MI.isBundle())
763           continue; // we analyze the instructions inside the bundle individually
764 
765       unsigned StallCycles;
766       unsigned UsedBanks;
767       std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
768 
769       if (Collect)
770         collectCandidates(MI, UsedBanks, StallCycles);
771 
772       TotalStallCycles += StallCycles;
773     }
774 
775     LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
776   }
777 
778   return TotalStallCycles;
779 }
780 
removeCandidates(Register Reg)781 void GCNRegBankReassign::removeCandidates(Register Reg) {
782   typename CandidateList::iterator Next;
783   for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
784     Next = std::next(I);
785     I->second.remove_if([Reg, this](const Candidate& C) {
786       return C.MI->readsRegister(Reg, TRI);
787     });
788     if (I->second.empty())
789       Candidates.erase(I);
790   }
791 }
792 
verifyCycles(MachineFunction & MF,unsigned OriginalCycles,unsigned CyclesSaved)793 bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
794                                       unsigned OriginalCycles,
795                                       unsigned CyclesSaved) {
796   unsigned StallCycles = collectCandidates(MF, false);
797   LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
798                << " stall cycles left\n");
799   return StallCycles + CyclesSaved == OriginalCycles;
800 }
801 
runOnMachineFunction(MachineFunction & MF)802 bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
803   ST = &MF.getSubtarget<GCNSubtarget>();
804   if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
805     return false;
806 
807   MRI = &MF.getRegInfo();
808   TRI = ST->getRegisterInfo();
809   MLI = &getAnalysis<MachineLoopInfo>();
810   VRM = &getAnalysis<VirtRegMap>();
811   LRM = &getAnalysis<LiveRegMatrix>();
812   LIS = &getAnalysis<LiveIntervals>();
813 
814   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
815   unsigned Occupancy = MFI->getOccupancy();
816   MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
817   MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
818   MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
819   MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
820 
821   CSRegs = MRI->getCalleeSavedRegs();
822   unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
823                          // Not a tight bound
824                          AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
825   RegsUsed.resize(NumRegBanks);
826 
827   LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
828                << '\n');
829 
830   unsigned StallCycles = collectCandidates(MF);
831   NumStallsDetected += StallCycles;
832 
833   LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
834                   "function " << MF.getName() << '\n');
835 
836   LLVM_DEBUG(Candidates.dump(this));
837 
838   unsigned CyclesSaved = 0;
839   while (!Candidates.empty()) {
840     Candidate C = Candidates.back();
841     unsigned LocalCyclesSaved = tryReassign(C);
842     CyclesSaved += LocalCyclesSaved;
843 
844     if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
845       report_fatal_error("RegBank reassign stall cycles verification failed.");
846 
847     Candidates.pop_back();
848     if (LocalCyclesSaved) {
849       removeCandidates(C.Reg);
850       computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
851 
852       LLVM_DEBUG(Candidates.dump(this));
853     }
854   }
855   NumStallsRecovered += CyclesSaved;
856 
857   LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
858                << " cycles saved in function " << MF.getName() << '\n');
859 
860   Candidates.clear();
861 
862   if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
863     report_fatal_error("RegBank reassign stall cycles verification failed.");
864 
865   RegsUsed.clear();
866 
867   return CyclesSaved > 0;
868 }
869