//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements the memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}
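
// For reference, the cache-control helpers below use this as, e.g.,
// enableNamedBit<AMDGPU::OpName::glc>(MI) to set the GLC modifier on a
// memory instruction that carries such an operand; opcodes without the
// named operand are left untouched and the helper simply returns false.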

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space. Assign through "this" so the
    // member, not the shadowing parameter, is cleared.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }
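
  // Illustrative note: an atomic that both accesses and orders only the LDS
  // address space (OrderingAddrSpace == InstrAddrSpace == LDS) cannot need
  // ordering against another address space, so the flag is cleared above even
  // if the caller passed IsCrossAddressSpaceOrdering = true.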

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:

  AMDGPUAS SIAddrSpaceInfo;
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns The bit set of SI atomic address spaces covered by the target
  /// address space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaInfo::IsaVersion IV;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure any caches associated with
  /// address spaces \p AddrSpace for memory scopes up to memory scope
  /// \p Scope are invalidated. Returns true iff any instructions were
  /// inserted.
  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions of kind \p Op
  /// associated with address spaces \p AddrSpace have completed as
  /// observed by other memory instructions executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  /// ordering is between address spaces. Returns true iff any
  /// instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Virtual destructor to allow derived classes to be deleted.
  virtual ~SICacheControl() = default;

};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }
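
  // Note: AMDGPU::getAtomicNoRetOp maps a returning atomic opcode to its
  // no-return counterpart, so the check above treats any opcode that has such
  // a counterpart as the returning form. This reading of the table-generated
  // mapping is an assumption of this comment, not something defined here.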

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  /// TODO: For now assume the OpenCL memory model, which treats each
  /// address space as having a separate happens-before relation, so an
  /// instruction only has ordering with respect to the address space it
  /// accesses, and if it accesses multiple address spaces it does not
  /// require ordering of operations in different address spaces.
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  /// TODO: To support the HSA memory model we need to add additional
  /// memory scopes that specify that they do require cross address
  /// space ordering.
  return None;
}
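
// For illustration, the named scopes tested above are the sync scope IDs that
// AMDGPUMachineModuleInfo registers for IR such as (assumed example):
//   load atomic i32, i32 addrspace(1)* %p syncscope("workgroup") acquire, align 4
// which this function would classify as (WORKGROUP, ATOMIC & InstrScope,
// false). The exact syncscope strings are produced by the frontend/runtime.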

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}
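
// When an instruction carries several memory operands (for example after
// loads or stores have been merged), the loop above combines them
// conservatively: the access is non-temporal only if every MMO is, the
// instruction address-space mask is the union over all MMOs, and the
// strongest ordering and most inclusive sync scope win. This is a summary of
// the code above, not additional policy.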

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return make_unique<SIGfx6CacheControl>(ST);
  return make_unique<SIGfx7CacheControl>(ST);
}
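
// The generation split above matches the two cache controls defined below:
// SOUTHERN_ISLANDS (gfx6) only has BUFFER_WBINVL1 for invalidating the vector
// L1 cache, while SEA_ISLANDS (gfx7) and later also provide
// BUFFER_WBINVL1_VOL, which SIGfx7CacheControl uses instead. (The
// availability claim is background knowledge, not something this file
// checks.)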

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;
  bool EXPCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an LDS waitcnt is not
      // needed as LDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/GDS memory as LDS operations
      // could be reordered with respect to later global/GDS memory
      // operations of the same wave.
      LGKMCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS waitcnt is not
      // needed as GDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/LDS memory as GDS operations
      // could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      EXPCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt || EXPCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            EXPCnt ? 0 : getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
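
// A sketch of the encoding above (illustrative, not emitted verbatim): when
// only VMCnt is set, the immediate requests vmcnt(0) while the other counters
// receive their maximum values from get*BitMask(IV), which makes the wait
// ignore them, so the emitted instruction is effectively
//   s_waitcnt vmcnt(0)
// The exact field widths depend on the ISA version in IV.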

bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}
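
// Putting the pieces together, a sequentially consistent agent-scope load
// from global memory on gfx7 would be rewritten roughly as (illustration
// only; the exact counters depend on the address spaces involved):
//   s_waitcnt vmcnt(0)    ; order against preceding loads and stores
//   <load with glc = 1>   ; bypass the L1 cache
//   s_waitcnt vmcnt(0)    ; complete the load before later accesses
//   buffer_wbinvl1_vol    ; invalidate potentially stale L1 lines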

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}
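
// For a release or seq_cst atomic store the only change is the wait inserted
// above; e.g. an agent-scope release store to global memory becomes roughly:
//   s_waitcnt vmcnt(0)    ; make prior accesses visible first
//   <original store>
// As the code above reflects, no cache invalidate is inserted for stores.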

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding waitcnt before a S_BARRIER.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);

    return Changed;
  }

  return Changed;
}
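
// The ATOMIC_FENCE pseudo itself produces no machine code: it is queued in
// AtomicPseudoMIs above and later erased by removeAtomicPseudoMIs(), leaving
// only the s_waitcnt (and, for acquire orderings, the cache invalidate)
// inserted before it.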

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}