1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/IR/IRBuilder.h"
19 #include "llvm/IR/InstVisitor.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Support/raw_ostream.h"
22 
23 #define DEBUG_TYPE "amdgpu-promote-alloca"
24 
25 using namespace llvm;
26 
27 namespace {
28 
29 class AMDGPUPromoteAlloca : public FunctionPass,
30                        public InstVisitor<AMDGPUPromoteAlloca> {
31 
32   static char ID;
33   Module *Mod;
34   const AMDGPUSubtarget &ST;
35   int LocalMemAvailable;
36 
37 public:
AMDGPUPromoteAlloca(const AMDGPUSubtarget & st)38   AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
39                                                    LocalMemAvailable(0) { }
40   bool doInitialization(Module &M) override;
41   bool runOnFunction(Function &F) override;
getPassName() const42   const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
43   void visitAlloca(AllocaInst &I);
44 };
45 
46 } // End anonymous namespace
47 
48 char AMDGPUPromoteAlloca::ID = 0;
49 
doInitialization(Module & M)50 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
51   Mod = &M;
52   return false;
53 }
54 
runOnFunction(Function & F)55 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
56 
57   FunctionType *FTy = F.getFunctionType();
58 
59   LocalMemAvailable = ST.getLocalMemorySize();
60 
61 
62   // If the function has any arguments in the local address space, then it's
63   // possible these arguments require the entire local memory space, so
64   // we cannot use local memory in the pass.
65   for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
66     Type *ParamTy = FTy->getParamType(i);
67     if (ParamTy->isPointerTy() &&
68         ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
69       LocalMemAvailable = 0;
70       DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
71                       "local memory disabled.\n");
72       break;
73     }
74   }
75 
76   if (LocalMemAvailable > 0) {
77     // Check how much local memory is being used by global objects
78     for (Module::global_iterator I = Mod->global_begin(),
79                                  E = Mod->global_end(); I != E; ++I) {
80       GlobalVariable *GV = &*I;
81       PointerType *GVTy = GV->getType();
82       if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
83         continue;
84       for (Value::use_iterator U = GV->use_begin(),
85                                UE = GV->use_end(); U != UE; ++U) {
86         Instruction *Use = dyn_cast<Instruction>(*U);
87         if (!Use)
88           continue;
89         if (Use->getParent()->getParent() == &F)
90           LocalMemAvailable -=
91               Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType());
92       }
93     }
94   }
95 
96   LocalMemAvailable = std::max(0, LocalMemAvailable);
97   DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
98 
99   visit(F);
100 
101   return false;
102 }
103 
arrayTypeToVecType(Type * ArrayTy)104 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
105   return VectorType::get(ArrayTy->getArrayElementType(),
106                          ArrayTy->getArrayNumElements());
107 }
108 
109 static Value *
calculateVectorIndex(Value * Ptr,const std::map<GetElementPtrInst *,Value * > & GEPIdx)110 calculateVectorIndex(Value *Ptr,
111                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
112   if (isa<AllocaInst>(Ptr))
113     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
114 
115   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
116 
117   auto I = GEPIdx.find(GEP);
118   return I == GEPIdx.end() ? nullptr : I->second;
119 }
120 
GEPToVectorIndex(GetElementPtrInst * GEP)121 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
122   // FIXME we only support simple cases
123   if (GEP->getNumOperands() != 3)
124     return NULL;
125 
126   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
127   if (!I0 || !I0->isZero())
128     return NULL;
129 
130   return GEP->getOperand(2);
131 }
132 
133 // Not an instruction handled below to turn into a vector.
134 //
135 // TODO: Check isTriviallyVectorizable for calls and handle other
136 // instructions.
canVectorizeInst(Instruction * Inst,User * User)137 static bool canVectorizeInst(Instruction *Inst, User *User) {
138   switch (Inst->getOpcode()) {
139   case Instruction::Load:
140   case Instruction::BitCast:
141   case Instruction::AddrSpaceCast:
142     return true;
143   case Instruction::Store: {
144     // Must be the stored pointer operand, not a stored value.
145     StoreInst *SI = cast<StoreInst>(Inst);
146     return SI->getPointerOperand() == User;
147   }
148   default:
149     return false;
150   }
151 }
152 
tryPromoteAllocaToVector(AllocaInst * Alloca)153 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
154   Type *AllocaTy = Alloca->getAllocatedType();
155 
156   DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
157 
158   // FIXME: There is no reason why we can't support larger arrays, we
159   // are just being conservative for now.
160   if (!AllocaTy->isArrayTy() ||
161       AllocaTy->getArrayElementType()->isVectorTy() ||
162       AllocaTy->getArrayNumElements() > 4) {
163 
164     DEBUG(dbgs() << "  Cannot convert type to vector");
165     return false;
166   }
167 
168   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
169   std::vector<Value*> WorkList;
170   for (User *AllocaUser : Alloca->users()) {
171     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
172     if (!GEP) {
173       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
174         return false;
175 
176       WorkList.push_back(AllocaUser);
177       continue;
178     }
179 
180     Value *Index = GEPToVectorIndex(GEP);
181 
182     // If we can't compute a vector index from this GEP, then we can't
183     // promote this alloca to vector.
184     if (!Index) {
185       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
186       return false;
187     }
188 
189     GEPVectorIdx[GEP] = Index;
190     for (User *GEPUser : AllocaUser->users()) {
191       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
192         return false;
193 
194       WorkList.push_back(GEPUser);
195     }
196   }
197 
198   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
199 
200   DEBUG(dbgs() << "  Converting alloca to vector "
201         << *AllocaTy << " -> " << *VectorTy << '\n');
202 
203   for (std::vector<Value*>::iterator I = WorkList.begin(),
204                                      E = WorkList.end(); I != E; ++I) {
205     Instruction *Inst = cast<Instruction>(*I);
206     IRBuilder<> Builder(Inst);
207     switch (Inst->getOpcode()) {
208     case Instruction::Load: {
209       Value *Ptr = Inst->getOperand(0);
210       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
211       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
212       Value *VecValue = Builder.CreateLoad(BitCast);
213       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
214       Inst->replaceAllUsesWith(ExtractElement);
215       Inst->eraseFromParent();
216       break;
217     }
218     case Instruction::Store: {
219       Value *Ptr = Inst->getOperand(1);
220       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
221       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
222       Value *VecValue = Builder.CreateLoad(BitCast);
223       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
224                                                        Inst->getOperand(0),
225                                                        Index);
226       Builder.CreateStore(NewVecValue, BitCast);
227       Inst->eraseFromParent();
228       break;
229     }
230     case Instruction::BitCast:
231     case Instruction::AddrSpaceCast:
232       break;
233 
234     default:
235       Inst->dump();
236       llvm_unreachable("Inconsistency in instructions promotable to vector");
237     }
238   }
239   return true;
240 }
241 
collectUsesWithPtrTypes(Value * Val,std::vector<Value * > & WorkList)242 static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
243   bool Success = true;
244   for (User *User : Val->users()) {
245     if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
246       continue;
247     if (CallInst *CI = dyn_cast<CallInst>(User)) {
248       // TODO: We might be able to handle some cases where the callee is a
249       // constantexpr bitcast of a function.
250       if (!CI->getCalledFunction())
251         return false;
252 
253       WorkList.push_back(User);
254       continue;
255     }
256 
257     // FIXME: Correctly handle ptrtoint instructions.
258     Instruction *UseInst = dyn_cast<Instruction>(User);
259     if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
260       return false;
261 
262     if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
263       // Reject if the stored value is not the pointer operand.
264       if (SI->getPointerOperand() != Val)
265         return false;
266     }
267 
268     if (!User->getType()->isPointerTy())
269       continue;
270 
271     WorkList.push_back(User);
272 
273     Success &= collectUsesWithPtrTypes(User, WorkList);
274   }
275   return Success;
276 }
277 
visitAlloca(AllocaInst & I)278 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
279   if (!I.isStaticAlloca())
280     return;
281 
282   IRBuilder<> Builder(&I);
283 
284   // First try to replace the alloca with a vector
285   Type *AllocaTy = I.getAllocatedType();
286 
287   DEBUG(dbgs() << "Trying to promote " << I << '\n');
288 
289   if (tryPromoteAllocaToVector(&I))
290     return;
291 
292   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
293 
294   // FIXME: This is the maximum work group size.  We should try to get
295   // value from the reqd_work_group_size function attribute if it is
296   // available.
297   unsigned WorkGroupSize = 256;
298   int AllocaSize =
299       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
300 
301   if (AllocaSize > LocalMemAvailable) {
302     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
303     return;
304   }
305 
306   std::vector<Value*> WorkList;
307 
308   if (!collectUsesWithPtrTypes(&I, WorkList)) {
309     DEBUG(dbgs() << " Do not know how to convert all uses\n");
310     return;
311   }
312 
313   DEBUG(dbgs() << "Promoting alloca to local memory\n");
314   LocalMemAvailable -= AllocaSize;
315 
316   Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
317   GlobalVariable *GV = new GlobalVariable(
318       *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
319       GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
320 
321   FunctionType *FTy = FunctionType::get(
322       Type::getInt32Ty(Mod->getContext()), false);
323   AttributeSet AttrSet;
324   AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
325 
326   Value *ReadLocalSizeY = Mod->getOrInsertFunction(
327       "llvm.r600.read.local.size.y", FTy, AttrSet);
328   Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
329       "llvm.r600.read.local.size.z", FTy, AttrSet);
330   Value *ReadTIDIGX = Mod->getOrInsertFunction(
331       "llvm.r600.read.tidig.x", FTy, AttrSet);
332   Value *ReadTIDIGY = Mod->getOrInsertFunction(
333       "llvm.r600.read.tidig.y", FTy, AttrSet);
334   Value *ReadTIDIGZ = Mod->getOrInsertFunction(
335       "llvm.r600.read.tidig.z", FTy, AttrSet);
336 
337   Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {});
338   Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {});
339   Value *TIdX = Builder.CreateCall(ReadTIDIGX, {});
340   Value *TIdY = Builder.CreateCall(ReadTIDIGY, {});
341   Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {});
342 
343   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
344   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
345   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
346   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
347   TID = Builder.CreateAdd(TID, TIdZ);
348 
349   std::vector<Value*> Indices;
350   Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
351   Indices.push_back(TID);
352 
353   Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
354   I.mutateType(Offset->getType());
355   I.replaceAllUsesWith(Offset);
356   I.eraseFromParent();
357 
358   for (std::vector<Value*>::iterator i = WorkList.begin(),
359                                      e = WorkList.end(); i != e; ++i) {
360     Value *V = *i;
361     CallInst *Call = dyn_cast<CallInst>(V);
362     if (!Call) {
363       Type *EltTy = V->getType()->getPointerElementType();
364       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
365 
366       // The operand's value should be corrected on its own.
367       if (isa<AddrSpaceCastInst>(V))
368         continue;
369 
370       // FIXME: It doesn't really make sense to try to do this for all
371       // instructions.
372       V->mutateType(NewTy);
373       continue;
374     }
375 
376     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
377     if (!Intr) {
378       std::vector<Type*> ArgTypes;
379       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
380                                 ArgIdx != ArgEnd; ++ArgIdx) {
381         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
382       }
383       Function *F = Call->getCalledFunction();
384       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
385                                                 F->isVarArg());
386       Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
387                                              NewType, F->getAttributes());
388       Function *NewF = cast<Function>(C);
389       Call->setCalledFunction(NewF);
390       continue;
391     }
392 
393     Builder.SetInsertPoint(Intr);
394     switch (Intr->getIntrinsicID()) {
395     case Intrinsic::lifetime_start:
396     case Intrinsic::lifetime_end:
397       // These intrinsics are for address space 0 only
398       Intr->eraseFromParent();
399       continue;
400     case Intrinsic::memcpy: {
401       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
402       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
403                            MemCpy->getLength(), MemCpy->getAlignment(),
404                            MemCpy->isVolatile());
405       Intr->eraseFromParent();
406       continue;
407     }
408     case Intrinsic::memset: {
409       MemSetInst *MemSet = cast<MemSetInst>(Intr);
410       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
411                            MemSet->getLength(), MemSet->getAlignment(),
412                            MemSet->isVolatile());
413       Intr->eraseFromParent();
414       continue;
415     }
416     default:
417       Intr->dump();
418       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
419     }
420   }
421 }
422 
createAMDGPUPromoteAlloca(const AMDGPUSubtarget & ST)423 FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
424   return new AMDGPUPromoteAlloca(ST);
425 }
426