//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

namespace llvm {
  void initializeSIShrinkInstructionsPass(PassRegistry&);
}

using namespace llvm;

namespace {

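/// Machine function pass that rewrites eligible 64-bit VALU instructions into
/// their 32-bit VOP1/VOP2/VOPC encodings and folds literal constants into the
/// shrunk instructions where the encoding allows it.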
class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Shrink Instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
                      "SI Shrink Instructions", false, false)
INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
                    "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

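/// Returns true if \p MO is a register operand whose register class contains
/// VGPRs, checking the virtual register class if the operand has not been
/// assigned a physical register yet.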
static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
                   const MachineRegisterInfo &MRI) {
  if (!MO->isReg())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));

  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
}

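/// Returns true if \p MI satisfies the operand constraints of the 32-bit
/// encoding: src1 (if present) must be a VGPR, no source or output modifiers
/// may be set, and a third source operand is only tolerated for the
/// special-cased opcodes below.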
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
                      const SIRegisterInfo &TRI,
                      const MachineRegisterInfo &MRI) {

  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instructions with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it.  It can only be shrunk if the third operand
  // is vcc.  We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then do the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_MAC_F32_e64:
        if (!isVGPR(Src2, TRI, MRI) ||
            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mod =
      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Check output modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
    return false;

  return true;
}

/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
/// and will only fold literal constants if we are still in SSA.
static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {

  if (!MRI.isSSA())
    return;

  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

  // Only one literal constant is allowed per instruction, so if src0 is a
  // literal constant then we can't do any folding.
  if (Src0.isImm() &&
      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
    return;

  // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
  // SGPR, we cannot commute the instruction, so we can't fold any literal
  // constants.
  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
    return;

  // Try to fold Src0.
  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
    unsigned Reg = Src0.getReg();
    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &MovSrc = Def->getOperand(1);
      bool ConstantFolded = false;

      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
        Src0.ChangeToImmediate(MovSrc.getImm());
        ConstantFolded = true;
      }
      if (ConstantFolded) {
        if (MRI.use_empty(Reg))
          Def->eraseFromParent();
        ++NumLiteralConstantsFolded;
        return;
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
    foldImmediates(MI, TII, MRI, false);
}

// Copy a register MachineOperand, preserving all of its flags but marking the
// copy as implicit.
static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
  assert(!Orig.isImplicit());
  return MachineOperand::CreateReg(Orig.getReg(),
                                   Orig.isDef(),
                                   /*isImp=*/true,
                                   Orig.isKill(),
                                   Orig.isDead(),
                                   Orig.isUndef(),
                                   Orig.isEarlyClobber(),
                                   Orig.getSubReg(),
                                   Orig.isDebug(),
                                   Orig.isInternalRead());
}

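/// Walks every instruction in \p MF, rewriting those that qualify into their
/// 32-bit encoding and folding literal constants into the shrunk
/// instructions.  For VOPC and V_CNDMASK_B32 the pre-RA run may only record a
/// VCC allocation hint; the actual shrinking happens when the pass runs again
/// after register allocation.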
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
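    // Cache the next iterator before visiting MI: shrinking erases MI and
    // inserts a new 32-bit instruction in its place.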
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm()) {
          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because there is only one such
          // register and we cannot handle sequences which would require
          // multiple copies of VCC, e.g. S_AND_B64 (vcc = V_CMP_...),
          // (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(DstReg, 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction.
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.addOperand(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      const MachineOperand *Src2 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.addOperand(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc.
          assert(Src2->getReg() == AMDGPU::VCC &&
                 "Unexpected missing register operand");
          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
        }
      }

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}