1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPURegisterBankInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/MC/MCSubtargetInfo.h"
27 #include "llvm/IR/MDBuilder.h"
28 #include "llvm/CodeGen/TargetFrameLowering.h"
29 #include <algorithm>
30
31 using namespace llvm;
32
33 #define DEBUG_TYPE "amdgpu-subtarget"
34
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #define GET_SUBTARGETINFO_CTOR
37 #define AMDGPUSubtarget GCNSubtarget
38 #include "AMDGPUGenSubtargetInfo.inc"
39 #define GET_SUBTARGETINFO_TARGET_DESC
40 #define GET_SUBTARGETINFO_CTOR
41 #undef AMDGPUSubtarget
42 #include "R600GenSubtargetInfo.inc"
43
44 GCNSubtarget::~GCNSubtarget() = default;
45
46 R600Subtarget &
initializeSubtargetDependencies(const Triple & TT,StringRef GPU,StringRef FS)47 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
48 StringRef GPU, StringRef FS) {
49 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
50 FullFS += FS;
51 ParseSubtargetFeatures(GPU, FullFS);
52
53 // FIXME: I don't think think Evergreen has any useful support for
54 // denormals, but should be checked. Should we issue a warning somewhere
55 // if someone tries to enable these?
56 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
57 FP32Denormals = false;
58 }
59
60 HasMulU24 = getGeneration() >= EVERGREEN;
61 HasMulI24 = hasCaymanISA();
62
63 return *this;
64 }
65
66 GCNSubtarget &
initializeSubtargetDependencies(const Triple & TT,StringRef GPU,StringRef FS)67 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
68 StringRef GPU, StringRef FS) {
69 // Determine default and user-specified characteristics
70 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
71 // enabled, but some instructions do not respect them and they run at the
72 // double precision rate, so don't enable by default.
73 //
74 // We want to be able to turn these off, but making this a subtarget feature
75 // for SI has the unhelpful behavior that it unsets everything else if you
76 // disable it.
77
78 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
79
80 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
81 FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
82
83 // FIXME: I don't think think Evergreen has any useful support for
84 // denormals, but should be checked. Should we issue a warning somewhere
85 // if someone tries to enable these?
86 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
87 FullFS += "+fp64-fp16-denormals,";
88 } else {
89 FullFS += "-fp32-denormals,";
90 }
91
92 FullFS += FS;
93
94 ParseSubtargetFeatures(GPU, FullFS);
95
96 // We don't support FP64 for EG/NI atm.
97 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
98
99 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
100 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
101 // variants of MUBUF instructions.
102 if (!hasAddr64() && !FS.contains("flat-for-global")) {
103 FlatForGlobal = true;
104 }
105
106 // Set defaults if needed.
107 if (MaxPrivateElementSize == 0)
108 MaxPrivateElementSize = 4;
109
110 if (LDSBankCount == 0)
111 LDSBankCount = 32;
112
113 if (TT.getArch() == Triple::amdgcn) {
114 if (LocalMemorySize == 0)
115 LocalMemorySize = 32768;
116
117 // Do something sensible for unspecified target.
118 if (!HasMovrel && !HasVGPRIndexMode)
119 HasMovrel = true;
120 }
121
122 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
123
124 return *this;
125 }
126
AMDGPUSubtarget(const Triple & TT,const FeatureBitset & FeatureBits)127 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
128 const FeatureBitset &FeatureBits) :
129 TargetTriple(TT),
130 SubtargetFeatureBits(FeatureBits),
131 Has16BitInsts(false),
132 HasMadMixInsts(false),
133 FP32Denormals(false),
134 FPExceptions(false),
135 HasSDWA(false),
136 HasVOP3PInsts(false),
137 HasMulI24(true),
138 HasMulU24(true),
139 HasFminFmaxLegacy(true),
140 EnablePromoteAlloca(false),
141 LocalMemorySize(0),
142 WavefrontSize(0)
143 { }
144
GCNSubtarget(const Triple & TT,StringRef GPU,StringRef FS,const GCNTargetMachine & TM)145 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
146 const GCNTargetMachine &TM) :
147 AMDGPUGenSubtargetInfo(TT, GPU, FS),
148 AMDGPUSubtarget(TT, getFeatureBits()),
149 TargetTriple(TT),
150 Gen(SOUTHERN_ISLANDS),
151 IsaVersion(ISAVersion0_0_0),
152 LDSBankCount(0),
153 MaxPrivateElementSize(0),
154
155 FastFMAF32(false),
156 HalfRate64Ops(false),
157
158 FP64FP16Denormals(false),
159 DX10Clamp(false),
160 FlatForGlobal(false),
161 AutoWaitcntBeforeBarrier(false),
162 CodeObjectV3(false),
163 UnalignedScratchAccess(false),
164 UnalignedBufferAccess(false),
165
166 HasApertureRegs(false),
167 EnableXNACK(false),
168 TrapHandler(false),
169 DebuggerInsertNops(false),
170 DebuggerEmitPrologue(false),
171
172 EnableHugePrivateBuffer(false),
173 EnableVGPRSpilling(false),
174 EnableLoadStoreOpt(false),
175 EnableUnsafeDSOffsetFolding(false),
176 EnableSIScheduler(false),
177 EnableDS128(false),
178 DumpCode(false),
179
180 FP64(false),
181 GCN3Encoding(false),
182 CIInsts(false),
183 GFX9Insts(false),
184 SGPRInitBug(false),
185 HasSMemRealTime(false),
186 HasIntClamp(false),
187 HasFmaMixInsts(false),
188 HasMovrel(false),
189 HasVGPRIndexMode(false),
190 HasScalarStores(false),
191 HasScalarAtomics(false),
192 HasInv2PiInlineImm(false),
193 HasSDWAOmod(false),
194 HasSDWAScalar(false),
195 HasSDWASdst(false),
196 HasSDWAMac(false),
197 HasSDWAOutModsVOPC(false),
198 HasDPP(false),
199 HasDLInsts(false),
200 D16PreservesUnusedBits(false),
201 FlatAddressSpace(false),
202 FlatInstOffsets(false),
203 FlatGlobalInsts(false),
204 FlatScratchInsts(false),
205 AddNoCarryInsts(false),
206 HasUnpackedD16VMem(false),
207
208 ScalarizeGlobal(false),
209
210 FeatureDisable(false),
211 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
212 TLInfo(TM, *this),
213 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
214 AS = AMDGPU::getAMDGPUAS(TT);
215 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
216 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
217 RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
218 InstSelector.reset(new AMDGPUInstructionSelector(
219 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
220 }
221
getMaxLocalMemSizeWithWaveCount(unsigned NWaves,const Function & F) const222 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
223 const Function &F) const {
224 if (NWaves == 1)
225 return getLocalMemorySize();
226 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
227 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
228 unsigned MaxWaves = getMaxWavesPerEU();
229 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
230 }
231
getOccupancyWithLocalMemSize(uint32_t Bytes,const Function & F) const232 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
233 const Function &F) const {
234 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
235 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
236 unsigned MaxWaves = getMaxWavesPerEU();
237 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
238 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
239 NumWaves = std::min(NumWaves, MaxWaves);
240 NumWaves = std::max(NumWaves, 1u);
241 return NumWaves;
242 }
243
244 unsigned
getOccupancyWithLocalMemSize(const MachineFunction & MF) const245 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
246 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
247 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
248 }
249
250 std::pair<unsigned, unsigned>
getDefaultFlatWorkGroupSize(CallingConv::ID CC) const251 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
252 switch (CC) {
253 case CallingConv::AMDGPU_CS:
254 case CallingConv::AMDGPU_KERNEL:
255 case CallingConv::SPIR_KERNEL:
256 return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
257 case CallingConv::AMDGPU_VS:
258 case CallingConv::AMDGPU_LS:
259 case CallingConv::AMDGPU_HS:
260 case CallingConv::AMDGPU_ES:
261 case CallingConv::AMDGPU_GS:
262 case CallingConv::AMDGPU_PS:
263 return std::make_pair(1, getWavefrontSize());
264 default:
265 return std::make_pair(1, 16 * getWavefrontSize());
266 }
267 }
268
getFlatWorkGroupSizes(const Function & F) const269 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
270 const Function &F) const {
271 // FIXME: 1024 if function.
272 // Default minimum/maximum flat work group sizes.
273 std::pair<unsigned, unsigned> Default =
274 getDefaultFlatWorkGroupSize(F.getCallingConv());
275
276 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
277 // starts using "amdgpu-flat-work-group-size" attribute.
278 Default.second = AMDGPU::getIntegerAttribute(
279 F, "amdgpu-max-work-group-size", Default.second);
280 Default.first = std::min(Default.first, Default.second);
281
282 // Requested minimum/maximum flat work group sizes.
283 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
284 F, "amdgpu-flat-work-group-size", Default);
285
286 // Make sure requested minimum is less than requested maximum.
287 if (Requested.first > Requested.second)
288 return Default;
289
290 // Make sure requested values do not violate subtarget's specifications.
291 if (Requested.first < getMinFlatWorkGroupSize())
292 return Default;
293 if (Requested.second > getMaxFlatWorkGroupSize())
294 return Default;
295
296 return Requested;
297 }
298
getWavesPerEU(const Function & F) const299 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
300 const Function &F) const {
301 // Default minimum/maximum number of waves per execution unit.
302 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
303
304 // Default/requested minimum/maximum flat work group sizes.
305 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
306
307 // If minimum/maximum flat work group sizes were explicitly requested using
308 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
309 // number of waves per execution unit to values implied by requested
310 // minimum/maximum flat work group sizes.
311 unsigned MinImpliedByFlatWorkGroupSize =
312 getMaxWavesPerEU(FlatWorkGroupSizes.second);
313 bool RequestedFlatWorkGroupSize = false;
314
315 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
316 // starts using "amdgpu-flat-work-group-size" attribute.
317 if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
318 F.hasFnAttribute("amdgpu-flat-work-group-size")) {
319 Default.first = MinImpliedByFlatWorkGroupSize;
320 RequestedFlatWorkGroupSize = true;
321 }
322
323 // Requested minimum/maximum number of waves per execution unit.
324 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
325 F, "amdgpu-waves-per-eu", Default, true);
326
327 // Make sure requested minimum is less than requested maximum.
328 if (Requested.second && Requested.first > Requested.second)
329 return Default;
330
331 // Make sure requested values do not violate subtarget's specifications.
332 if (Requested.first < getMinWavesPerEU() ||
333 Requested.first > getMaxWavesPerEU())
334 return Default;
335 if (Requested.second > getMaxWavesPerEU())
336 return Default;
337
338 // Make sure requested values are compatible with values implied by requested
339 // minimum/maximum flat work group sizes.
340 if (RequestedFlatWorkGroupSize &&
341 Requested.first < MinImpliedByFlatWorkGroupSize)
342 return Default;
343
344 return Requested;
345 }
346
makeLIDRangeMetadata(Instruction * I) const347 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
348 Function *Kernel = I->getParent()->getParent();
349 unsigned MinSize = 0;
350 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
351 bool IdQuery = false;
352
353 // If reqd_work_group_size is present it narrows value down.
354 if (auto *CI = dyn_cast<CallInst>(I)) {
355 const Function *F = CI->getCalledFunction();
356 if (F) {
357 unsigned Dim = UINT_MAX;
358 switch (F->getIntrinsicID()) {
359 case Intrinsic::amdgcn_workitem_id_x:
360 case Intrinsic::r600_read_tidig_x:
361 IdQuery = true;
362 LLVM_FALLTHROUGH;
363 case Intrinsic::r600_read_local_size_x:
364 Dim = 0;
365 break;
366 case Intrinsic::amdgcn_workitem_id_y:
367 case Intrinsic::r600_read_tidig_y:
368 IdQuery = true;
369 LLVM_FALLTHROUGH;
370 case Intrinsic::r600_read_local_size_y:
371 Dim = 1;
372 break;
373 case Intrinsic::amdgcn_workitem_id_z:
374 case Intrinsic::r600_read_tidig_z:
375 IdQuery = true;
376 LLVM_FALLTHROUGH;
377 case Intrinsic::r600_read_local_size_z:
378 Dim = 2;
379 break;
380 default:
381 break;
382 }
383 if (Dim <= 3) {
384 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
385 if (Node->getNumOperands() == 3)
386 MinSize = MaxSize = mdconst::extract<ConstantInt>(
387 Node->getOperand(Dim))->getZExtValue();
388 }
389 }
390 }
391
392 if (!MaxSize)
393 return false;
394
395 // Range metadata is [Lo, Hi). For ID query we need to pass max size
396 // as Hi. For size query we need to pass Hi + 1.
397 if (IdQuery)
398 MinSize = 0;
399 else
400 ++MaxSize;
401
402 MDBuilder MDB(I->getContext());
403 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
404 APInt(32, MaxSize));
405 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
406 return true;
407 }
408
getExplicitKernArgSize(const Function & F,unsigned & MaxAlign) const409 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
410 unsigned &MaxAlign) const {
411 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
412 F.getCallingConv() == CallingConv::SPIR_KERNEL);
413
414 const DataLayout &DL = F.getParent()->getDataLayout();
415 uint64_t ExplicitArgBytes = 0;
416 MaxAlign = 1;
417
418 for (const Argument &Arg : F.args()) {
419 Type *ArgTy = Arg.getType();
420
421 unsigned Align = DL.getABITypeAlignment(ArgTy);
422 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
423 ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
424 MaxAlign = std::max(MaxAlign, Align);
425 }
426
427 return ExplicitArgBytes;
428 }
429
getKernArgSegmentSize(const Function & F,unsigned & MaxAlign) const430 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
431 unsigned &MaxAlign) const {
432 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
433
434 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
435
436 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
437 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
438 if (ImplicitBytes != 0) {
439 unsigned Alignment = getAlignmentForImplicitArgPtr();
440 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
441 }
442
443 // Being able to dereference past the end is useful for emitting scalar loads.
444 return alignTo(TotalSize, 4);
445 }
446
R600Subtarget(const Triple & TT,StringRef GPU,StringRef FS,const TargetMachine & TM)447 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
448 const TargetMachine &TM) :
449 R600GenSubtargetInfo(TT, GPU, FS),
450 AMDGPUSubtarget(TT, getFeatureBits()),
451 InstrInfo(*this),
452 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
453 FMA(false),
454 CaymanISA(false),
455 CFALUBug(false),
456 DX10Clamp(false),
457 HasVertexCache(false),
458 R600ALUInst(false),
459 FP64(false),
460 TexVTXClauseSize(0),
461 Gen(R600),
462 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
463 InstrItins(getInstrItineraryForCPU(GPU)),
464 AS (AMDGPU::getAMDGPUAS(TT)) { }
465
overrideSchedPolicy(MachineSchedPolicy & Policy,unsigned NumRegionInstrs) const466 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
467 unsigned NumRegionInstrs) const {
468 // Track register pressure so the scheduler can try to decrease
469 // pressure once register usage is above the threshold defined by
470 // SIRegisterInfo::getRegPressureSetLimit()
471 Policy.ShouldTrackPressure = true;
472
473 // Enabling both top down and bottom up scheduling seems to give us less
474 // register spills than just using one of these approaches on its own.
475 Policy.OnlyTopDown = false;
476 Policy.OnlyBottomUp = false;
477
478 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
479 if (!enableSIScheduler())
480 Policy.ShouldTrackLaneMasks = true;
481 }
482
isVGPRSpillingEnabled(const Function & F) const483 bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
484 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
485 }
486
getOccupancyWithNumSGPRs(unsigned SGPRs) const487 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
488 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
489 if (SGPRs <= 80)
490 return 10;
491 if (SGPRs <= 88)
492 return 9;
493 if (SGPRs <= 100)
494 return 8;
495 return 7;
496 }
497 if (SGPRs <= 48)
498 return 10;
499 if (SGPRs <= 56)
500 return 9;
501 if (SGPRs <= 64)
502 return 8;
503 if (SGPRs <= 72)
504 return 7;
505 if (SGPRs <= 80)
506 return 6;
507 return 5;
508 }
509
getOccupancyWithNumVGPRs(unsigned VGPRs) const510 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
511 if (VGPRs <= 24)
512 return 10;
513 if (VGPRs <= 28)
514 return 9;
515 if (VGPRs <= 32)
516 return 8;
517 if (VGPRs <= 36)
518 return 7;
519 if (VGPRs <= 40)
520 return 6;
521 if (VGPRs <= 48)
522 return 5;
523 if (VGPRs <= 64)
524 return 4;
525 if (VGPRs <= 84)
526 return 3;
527 if (VGPRs <= 128)
528 return 2;
529 return 1;
530 }
531
getReservedNumSGPRs(const MachineFunction & MF) const532 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
533 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
534 if (MFI.hasFlatScratchInit()) {
535 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
536 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
537 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
538 return 4; // FLAT_SCRATCH, VCC (in that order).
539 }
540
541 if (isXNACKEnabled())
542 return 4; // XNACK, VCC (in that order).
543 return 2; // VCC.
544 }
545
getMaxNumSGPRs(const MachineFunction & MF) const546 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
547 const Function &F = MF.getFunction();
548 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
549
550 // Compute maximum number of SGPRs function can use using default/requested
551 // minimum number of waves per execution unit.
552 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
553 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
554 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
555
556 // Check if maximum number of SGPRs was explicitly requested using
557 // "amdgpu-num-sgpr" attribute.
558 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
559 unsigned Requested = AMDGPU::getIntegerAttribute(
560 F, "amdgpu-num-sgpr", MaxNumSGPRs);
561
562 // Make sure requested value does not violate subtarget's specifications.
563 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
564 Requested = 0;
565
566 // If more SGPRs are required to support the input user/system SGPRs,
567 // increase to accommodate them.
568 //
569 // FIXME: This really ends up using the requested number of SGPRs + number
570 // of reserved special registers in total. Theoretically you could re-use
571 // the last input registers for these special registers, but this would
572 // require a lot of complexity to deal with the weird aliasing.
573 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
574 if (Requested && Requested < InputNumSGPRs)
575 Requested = InputNumSGPRs;
576
577 // Make sure requested value is compatible with values implied by
578 // default/requested minimum/maximum number of waves per execution unit.
579 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
580 Requested = 0;
581 if (WavesPerEU.second &&
582 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
583 Requested = 0;
584
585 if (Requested)
586 MaxNumSGPRs = Requested;
587 }
588
589 if (hasSGPRInitBug())
590 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
591
592 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
593 MaxAddressableNumSGPRs);
594 }
595
getMaxNumVGPRs(const MachineFunction & MF) const596 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
597 const Function &F = MF.getFunction();
598 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
599
600 // Compute maximum number of VGPRs function can use using default/requested
601 // minimum number of waves per execution unit.
602 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
603 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
604
605 // Check if maximum number of VGPRs was explicitly requested using
606 // "amdgpu-num-vgpr" attribute.
607 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
608 unsigned Requested = AMDGPU::getIntegerAttribute(
609 F, "amdgpu-num-vgpr", MaxNumVGPRs);
610
611 // Make sure requested value is compatible with values implied by
612 // default/requested minimum/maximum number of waves per execution unit.
613 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
614 Requested = 0;
615 if (WavesPerEU.second &&
616 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
617 Requested = 0;
618
619 if (Requested)
620 MaxNumVGPRs = Requested;
621 }
622
623 return MaxNumVGPRs;
624 }
625
626 namespace {
627 struct MemOpClusterMutation : ScheduleDAGMutation {
628 const SIInstrInfo *TII;
629
MemOpClusterMutation__anonf96fb1bc0111::MemOpClusterMutation630 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
631
apply__anonf96fb1bc0111::MemOpClusterMutation632 void apply(ScheduleDAGInstrs *DAGInstrs) override {
633 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
634
635 SUnit *SUa = nullptr;
636 // Search for two consequent memory operations and link them
637 // to prevent scheduler from moving them apart.
638 // In DAG pre-process SUnits are in the original order of
639 // the instructions before scheduling.
640 for (SUnit &SU : DAG->SUnits) {
641 MachineInstr &MI2 = *SU.getInstr();
642 if (!MI2.mayLoad() && !MI2.mayStore()) {
643 SUa = nullptr;
644 continue;
645 }
646 if (!SUa) {
647 SUa = &SU;
648 continue;
649 }
650
651 MachineInstr &MI1 = *SUa->getInstr();
652 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
653 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
654 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
655 (TII->isDS(MI1) && TII->isDS(MI2))) {
656 SU.addPredBarrier(SUa);
657
658 for (const SDep &SI : SU.Preds) {
659 if (SI.getSUnit() != SUa)
660 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
661 }
662
663 if (&SU != &DAG->ExitSU) {
664 for (const SDep &SI : SUa->Succs) {
665 if (SI.getSUnit() != &SU)
666 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
667 }
668 }
669 }
670
671 SUa = &SU;
672 }
673 }
674 };
675 } // namespace
676
getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>> & Mutations) const677 void GCNSubtarget::getPostRAMutations(
678 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
679 Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
680 }
681
get(const MachineFunction & MF)682 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
683 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
684 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
685 else
686 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
687 }
688
get(const TargetMachine & TM,const Function & F)689 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
690 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
691 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
692 else
693 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
694 }
695