1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/IR/IRBuilder.h"
19 #include "llvm/IR/InstVisitor.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Support/raw_ostream.h"
22
23 #define DEBUG_TYPE "amdgpu-promote-alloca"
24
25 using namespace llvm;
26
27 namespace {
28
29 class AMDGPUPromoteAlloca : public FunctionPass,
30 public InstVisitor<AMDGPUPromoteAlloca> {
31
32 static char ID;
33 Module *Mod;
34 const AMDGPUSubtarget &ST;
35 int LocalMemAvailable;
36
37 public:
AMDGPUPromoteAlloca(const AMDGPUSubtarget & st)38 AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
39 LocalMemAvailable(0) { }
40 bool doInitialization(Module &M) override;
41 bool runOnFunction(Function &F) override;
getPassName() const42 const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
43 void visitAlloca(AllocaInst &I);
44 };
45
46 } // End anonymous namespace
47
48 char AMDGPUPromoteAlloca::ID = 0;
49
doInitialization(Module & M)50 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
51 Mod = &M;
52 return false;
53 }
54
runOnFunction(Function & F)55 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
56
57 const FunctionType *FTy = F.getFunctionType();
58
59 LocalMemAvailable = ST.getLocalMemorySize();
60
61
62 // If the function has any arguments in the local address space, then it's
63 // possible these arguments require the entire local memory space, so
64 // we cannot use local memory in the pass.
65 for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
66 const Type *ParamTy = FTy->getParamType(i);
67 if (ParamTy->isPointerTy() &&
68 ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
69 LocalMemAvailable = 0;
70 DEBUG(dbgs() << "Function has local memory argument. Promoting to "
71 "local memory disabled.\n");
72 break;
73 }
74 }
75
76 if (LocalMemAvailable > 0) {
77 // Check how much local memory is being used by global objects
78 for (Module::global_iterator I = Mod->global_begin(),
79 E = Mod->global_end(); I != E; ++I) {
80 GlobalVariable *GV = I;
81 PointerType *GVTy = GV->getType();
82 if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
83 continue;
84 for (Value::use_iterator U = GV->use_begin(),
85 UE = GV->use_end(); U != UE; ++U) {
86 Instruction *Use = dyn_cast<Instruction>(*U);
87 if (!Use)
88 continue;
89 if (Use->getParent()->getParent() == &F)
90 LocalMemAvailable -=
91 Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType());
92 }
93 }
94 }
95
96 LocalMemAvailable = std::max(0, LocalMemAvailable);
97 DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
98
99 visit(F);
100
101 return false;
102 }
103
arrayTypeToVecType(const Type * ArrayTy)104 static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
105 return VectorType::get(ArrayTy->getArrayElementType(),
106 ArrayTy->getArrayNumElements());
107 }
108
109 static Value *
calculateVectorIndex(Value * Ptr,const std::map<GetElementPtrInst *,Value * > & GEPIdx)110 calculateVectorIndex(Value *Ptr,
111 const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
112 if (isa<AllocaInst>(Ptr))
113 return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
114
115 GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
116
117 auto I = GEPIdx.find(GEP);
118 return I == GEPIdx.end() ? nullptr : I->second;
119 }
120
GEPToVectorIndex(GetElementPtrInst * GEP)121 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
122 // FIXME we only support simple cases
123 if (GEP->getNumOperands() != 3)
124 return NULL;
125
126 ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
127 if (!I0 || !I0->isZero())
128 return NULL;
129
130 return GEP->getOperand(2);
131 }
132
133 // Not an instruction handled below to turn into a vector.
134 //
135 // TODO: Check isTriviallyVectorizable for calls and handle other
136 // instructions.
canVectorizeInst(Instruction * Inst)137 static bool canVectorizeInst(Instruction *Inst) {
138 switch (Inst->getOpcode()) {
139 case Instruction::Load:
140 case Instruction::Store:
141 case Instruction::BitCast:
142 case Instruction::AddrSpaceCast:
143 return true;
144 default:
145 return false;
146 }
147 }
148
tryPromoteAllocaToVector(AllocaInst * Alloca)149 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
150 Type *AllocaTy = Alloca->getAllocatedType();
151
152 DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
153
154 // FIXME: There is no reason why we can't support larger arrays, we
155 // are just being conservative for now.
156 if (!AllocaTy->isArrayTy() ||
157 AllocaTy->getArrayElementType()->isVectorTy() ||
158 AllocaTy->getArrayNumElements() > 4) {
159
160 DEBUG(dbgs() << " Cannot convert type to vector");
161 return false;
162 }
163
164 std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
165 std::vector<Value*> WorkList;
166 for (User *AllocaUser : Alloca->users()) {
167 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
168 if (!GEP) {
169 if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
170 return false;
171
172 WorkList.push_back(AllocaUser);
173 continue;
174 }
175
176 Value *Index = GEPToVectorIndex(GEP);
177
178 // If we can't compute a vector index from this GEP, then we can't
179 // promote this alloca to vector.
180 if (!Index) {
181 DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
182 return false;
183 }
184
185 GEPVectorIdx[GEP] = Index;
186 for (User *GEPUser : AllocaUser->users()) {
187 if (!canVectorizeInst(cast<Instruction>(GEPUser)))
188 return false;
189
190 WorkList.push_back(GEPUser);
191 }
192 }
193
194 VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
195
196 DEBUG(dbgs() << " Converting alloca to vector "
197 << *AllocaTy << " -> " << *VectorTy << '\n');
198
199 for (std::vector<Value*>::iterator I = WorkList.begin(),
200 E = WorkList.end(); I != E; ++I) {
201 Instruction *Inst = cast<Instruction>(*I);
202 IRBuilder<> Builder(Inst);
203 switch (Inst->getOpcode()) {
204 case Instruction::Load: {
205 Value *Ptr = Inst->getOperand(0);
206 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
207 Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
208 Value *VecValue = Builder.CreateLoad(BitCast);
209 Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
210 Inst->replaceAllUsesWith(ExtractElement);
211 Inst->eraseFromParent();
212 break;
213 }
214 case Instruction::Store: {
215 Value *Ptr = Inst->getOperand(1);
216 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
217 Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
218 Value *VecValue = Builder.CreateLoad(BitCast);
219 Value *NewVecValue = Builder.CreateInsertElement(VecValue,
220 Inst->getOperand(0),
221 Index);
222 Builder.CreateStore(NewVecValue, BitCast);
223 Inst->eraseFromParent();
224 break;
225 }
226 case Instruction::BitCast:
227 case Instruction::AddrSpaceCast:
228 break;
229
230 default:
231 Inst->dump();
232 llvm_unreachable("Inconsistency in instructions promotable to vector");
233 }
234 }
235 return true;
236 }
237
collectUsesWithPtrTypes(Value * Val,std::vector<Value * > & WorkList)238 static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
239 bool Success = true;
240 for (User *User : Val->users()) {
241 if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
242 continue;
243 if (isa<CallInst>(User)) {
244 WorkList.push_back(User);
245 continue;
246 }
247
248 // FIXME: Correctly handle ptrtoint instructions.
249 Instruction *UseInst = dyn_cast<Instruction>(User);
250 if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
251 return false;
252
253 if (!User->getType()->isPointerTy())
254 continue;
255
256 WorkList.push_back(User);
257
258 Success &= collectUsesWithPtrTypes(User, WorkList);
259 }
260 return Success;
261 }
262
visitAlloca(AllocaInst & I)263 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
264 IRBuilder<> Builder(&I);
265
266 // First try to replace the alloca with a vector
267 Type *AllocaTy = I.getAllocatedType();
268
269 DEBUG(dbgs() << "Trying to promote " << I << '\n');
270
271 if (tryPromoteAllocaToVector(&I))
272 return;
273
274 DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
275
276 // FIXME: This is the maximum work group size. We should try to get
277 // value from the reqd_work_group_size function attribute if it is
278 // available.
279 unsigned WorkGroupSize = 256;
280 int AllocaSize =
281 WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
282
283 if (AllocaSize > LocalMemAvailable) {
284 DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
285 return;
286 }
287
288 std::vector<Value*> WorkList;
289
290 if (!collectUsesWithPtrTypes(&I, WorkList)) {
291 DEBUG(dbgs() << " Do not know how to convert all uses\n");
292 return;
293 }
294
295 DEBUG(dbgs() << "Promoting alloca to local memory\n");
296 LocalMemAvailable -= AllocaSize;
297
298 Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
299 GlobalVariable *GV = new GlobalVariable(
300 *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
301 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
302
303 FunctionType *FTy = FunctionType::get(
304 Type::getInt32Ty(Mod->getContext()), false);
305 AttributeSet AttrSet;
306 AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
307
308 Value *ReadLocalSizeY = Mod->getOrInsertFunction(
309 "llvm.r600.read.local.size.y", FTy, AttrSet);
310 Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
311 "llvm.r600.read.local.size.z", FTy, AttrSet);
312 Value *ReadTIDIGX = Mod->getOrInsertFunction(
313 "llvm.r600.read.tidig.x", FTy, AttrSet);
314 Value *ReadTIDIGY = Mod->getOrInsertFunction(
315 "llvm.r600.read.tidig.y", FTy, AttrSet);
316 Value *ReadTIDIGZ = Mod->getOrInsertFunction(
317 "llvm.r600.read.tidig.z", FTy, AttrSet);
318
319
320 Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
321 Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
322 Value *TIdX = Builder.CreateCall(ReadTIDIGX);
323 Value *TIdY = Builder.CreateCall(ReadTIDIGY);
324 Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
325
326 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
327 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
328 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
329 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
330 TID = Builder.CreateAdd(TID, TIdZ);
331
332 std::vector<Value*> Indices;
333 Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
334 Indices.push_back(TID);
335
336 Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
337 I.mutateType(Offset->getType());
338 I.replaceAllUsesWith(Offset);
339 I.eraseFromParent();
340
341 for (std::vector<Value*>::iterator i = WorkList.begin(),
342 e = WorkList.end(); i != e; ++i) {
343 Value *V = *i;
344 CallInst *Call = dyn_cast<CallInst>(V);
345 if (!Call) {
346 Type *EltTy = V->getType()->getPointerElementType();
347 PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
348
349 // The operand's value should be corrected on its own.
350 if (isa<AddrSpaceCastInst>(V))
351 continue;
352
353 // FIXME: It doesn't really make sense to try to do this for all
354 // instructions.
355 V->mutateType(NewTy);
356 continue;
357 }
358
359 IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
360 if (!Intr) {
361 std::vector<Type*> ArgTypes;
362 for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
363 ArgIdx != ArgEnd; ++ArgIdx) {
364 ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
365 }
366 Function *F = Call->getCalledFunction();
367 FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
368 F->isVarArg());
369 Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
370 NewType, F->getAttributes());
371 Function *NewF = cast<Function>(C);
372 Call->setCalledFunction(NewF);
373 continue;
374 }
375
376 Builder.SetInsertPoint(Intr);
377 switch (Intr->getIntrinsicID()) {
378 case Intrinsic::lifetime_start:
379 case Intrinsic::lifetime_end:
380 // These intrinsics are for address space 0 only
381 Intr->eraseFromParent();
382 continue;
383 case Intrinsic::memcpy: {
384 MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
385 Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
386 MemCpy->getLength(), MemCpy->getAlignment(),
387 MemCpy->isVolatile());
388 Intr->eraseFromParent();
389 continue;
390 }
391 case Intrinsic::memset: {
392 MemSetInst *MemSet = cast<MemSetInst>(Intr);
393 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
394 MemSet->getLength(), MemSet->getAlignment(),
395 MemSet->isVolatile());
396 Intr->eraseFromParent();
397 continue;
398 }
399 default:
400 Intr->dump();
401 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
402 }
403 }
404 }
405
createAMDGPUPromoteAlloca(const AMDGPUSubtarget & ST)406 FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
407 return new AMDGPUPromoteAlloca(ST);
408 }
409