//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

#include <sstream>

#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
    "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif

using namespace llvm;
// The Peephole optimization pass is used to do simple last-minute optimizations
// that are required for correct code or to remove redundant functions.
namespace {

class OpaqueType;

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If so, we set all pointers to exist in the arena. This is a
  // workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because I don't want to invalidate any pointers while in
  // safeNestedForEach, I push atomic conversions to a vector and handle them
  // later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware we do not have 24-bit signed operations, so in
  // this case we need to expand them. These functions check for 24-bit
  // functions and then expand them.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified, then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. On
  // all other cards we can translate the accurate divide into a normal divide.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated, as constant propagation
  // doesn't occur and we need to know the value of kernel-defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions

  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector<std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
  char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
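// In this pass it is used by runOnFunction to walk every instruction in every
// basic block: the callback receives a pointer to the iterator so it can
// advance past (and then erase) the instruction it is currently visiting.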
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                              SecondIterator S, Function F)
{
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; ) {
      if (!F(&sf)) {
        ++sf;
      }
    }
  }
  return F;
}

} // anonymous namespace

namespace llvm {
  FunctionPass *
  createAMDGPUPeepholeOpt(TargetMachine &tm)
  {
    return new AMDGPUPeepholeOpt(tm);
  }
} // llvm namespace

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm)
{
  mDebug = false;
  optLevel = TM.getOptLevel();
}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()
{
}

const char *
AMDGPUPeepholeOpt::getPassName() const
{
  return "AMDGPU PeepHole Optimization Pass";
}

bool
containsPointerType(Type *Ty)
{
  if (!Ty) {
    return false;
  }
  switch(Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
           ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}

bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)
{
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
{
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
      : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
{
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1,
        atomicFuncs[x].second);
  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}

bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF)
{
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                  this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}

bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
{
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If optimizations are disabled, this cannot be properly evaluated,
    // so we add the call instruction to a vector and process it at the
    // end, after the samplers have been correctly handled.
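    // Concretely, __amdil_is_constant(x) folds to 1 when its operand is a
    // compile-time constant and to 0 otherwise (see the fold directly below
    // and in doIsConstCallConversionIfNeeded).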
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
        : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair<CallInst*, Function*>(CI, F));
  }

  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}

bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
    Instruction *&src,
    Constant *&mask,
    Constant *&shift)
{
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, we don't fit any of the
    // patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals a
  // single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
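  // For example, (A & 0xFFFF0000) | (D & 0x0000FFFF) matches the base pattern
  // with C == F == 0 and can be collapsed into a single __amdil_ubit_insert of
  // width 16 at offset 16, inserting A >> 16 over the low-masked D.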
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  // TODO: Handle vectors.
  if (isVector) {
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    dbgs() << "Op:        "; inst->dump();
    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (int32_t)(LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (int32_t)(RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (int32_t)(LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (int32_t)(RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if (mDebug) {
      dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
      dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
      dbgs() << (RHSMask ? " & E)" : ")");
      dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
      dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
      dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
      dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
      dbgs() << "width(B) = " << lhsMaskWidth;
      dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
      dbgs() << "offset(B) = " << lhsMaskOffset;
      dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
      dbgs() << "Constraints: \n";
      dbgs() << "\t(1) B ^ E == 0\n";
      dbgs() << "\t(2-LHS) B is a mask\n";
      dbgs() << "\t(2-RHS) E is a mask\n";
      dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
      dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
  }
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    if (mDebug) {
      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
      dbgs() << "Failed constraint 1!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "LHS = " << lhsMaskOffset << "";
    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
    dbgs() << "\nRHS = " << rhsMaskOffset << "";
    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
    dbgs() << "\n";
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on Shift right/And patterns. The
  // basic pattern is (A >> B) & C, where A is a 32bit type, B is a value
  // smaller than 32 and C is a mask. If C is a constant value, then the
  // following transformation can occur: for signed integers it turns into the
  // function call dst = __amdil_ibit_extract(log2(C), B, A); for unsigned
  // integers it turns into the function call
  // dst = __amdil_ubit_extract(log2(C), B, A). The __amdil_[u|i]bit_extract
  // functions can be found in Section 7.9 of the ATI IL spec of the stream SDK
  // for Evergreen hardware.
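  // For example, (A >> 8) & 0xFF (an 8-bit mask) becomes a bit-extract of
  // width 8 at offset 8 from A.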
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  if (isVector) {
    return false;
  }
  int numEle = 1;
  // This only works on 32bit integers
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If it is a shift left, then we don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shift val is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask, this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDIL.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  } else {
    name += ".";
  }
  // Let's create the function.
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Let's create the Call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
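  // i.e. for each bit position, select the bit from src1 (B) where src0 (A)
  // has a 1, and the bit from src2 (C) where src0 has a 0.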
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
        CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
        "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
      "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
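  // i.e. a field of (src0 & 0x1F) one-bits starting at bit (src1 & 0x1F); for
  // example src0 = 8, src1 = 4 produces the mask 0x00000FF0.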
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
        newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
      lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
      newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
        newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
{
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
{
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type* Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size <= alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
  if (alignment < 4) {
    if (linst) {
      linst->setAlignment(0);
      return true;
    } else if (sinst) {
      sinst->setAlignment(0);
      return true;
    }
  }
  return false;
}
bool
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
      && namePrefix != "__amdil__imul24_high") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}

void
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)
{
  assert(isSigned24BitOps(CI) && "Must be a "
      "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24bit, so we need to
  // expand it to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
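  // For example, __amdil_imad24(a, b, c) is rewritten as a call to the 32-bit
  // __amdil_imad, and __amdil_imul24(a, b) becomes a plain 32-bit mul
  // instruction.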
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
          CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();

    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  }
}

bool
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)
{
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
          == "__amdil_get_local_size_int");
}

bool
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
      == "__amdil_improved_div";
}

void
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)
{
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}

bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
{
  if (optLevel != CodeGenOpt::None) {
    return false;
  }

  if (!CI) {
    return false;
  }

  unsigned funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
   && calleeName != "__amdil_image2d_read_unnorm"
   && calleeName != "__amdil_image3d_read_norm"
   && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }

  unsigned samplerIdx = 1;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }

  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }

  // If we don't have an initializer, or the initializer is not a 32bit
  // integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
      return false;
  }

  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}

bool
AMDGPUPeepholeOpt::doInitialization(Module &M)
{
  return false;
}

bool
AMDGPUPeepholeOpt::doFinalization(Module &M)
{
  return false;
}

void
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
{
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}

size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  size_t size = 0;
  if (!T) {
    return size;
  }
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  default:
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
    break;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
    break;
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
    break;
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
    break;
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
    break;
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
    break;
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
    break;
  };
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
    bool dereferencePtr) {
  size_t size = 0;
  if (!ST) {
    return size;
  }
  Type *curType;
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    curType = *eib;
    size += getTypeSize(curType, dereferencePtr);
  }
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
    bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
    bool dereferencePtr) {
    assert(0 && "Should not be able to calculate the size of a function type");
    return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
    bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
    bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
    bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
    bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}