1 //===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains a pass (at IR level) to replace atomic instructions with
11 // __atomic_* library calls, or target specific instruction which implement the
12 // same semantics in a way which better fits the target backend.  This can
13 // include the use of (intrinsic-based) load-linked/store-conditional loops,
14 // AtomicCmpXchg, or type coercions.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/CodeGen/AtomicExpandUtils.h"
19 #include "llvm/CodeGen/Passes.h"
20 #include "llvm/IR/Function.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/InstIterator.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/Intrinsics.h"
25 #include "llvm/IR/Module.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include "llvm/Target/TargetLowering.h"
29 #include "llvm/Target/TargetMachine.h"
30 #include "llvm/Target/TargetSubtargetInfo.h"
31 
32 using namespace llvm;
33 
34 #define DEBUG_TYPE "atomic-expand"
35 
36 namespace {
37   class AtomicExpand: public FunctionPass {
38     const TargetMachine *TM;
39     const TargetLowering *TLI;
40   public:
41     static char ID; // Pass identification, replacement for typeid
AtomicExpand(const TargetMachine * TM=nullptr)42     explicit AtomicExpand(const TargetMachine *TM = nullptr)
43       : FunctionPass(ID), TM(TM), TLI(nullptr) {
44       initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
45     }
46 
47     bool runOnFunction(Function &F) override;
48 
49   private:
50     bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
51                                bool IsStore, bool IsLoad);
52     IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
53     LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
54     bool tryExpandAtomicLoad(LoadInst *LI);
55     bool expandAtomicLoadToLL(LoadInst *LI);
56     bool expandAtomicLoadToCmpXchg(LoadInst *LI);
57     StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
58     bool expandAtomicStore(StoreInst *SI);
59     bool tryExpandAtomicRMW(AtomicRMWInst *AI);
60     Value *
61     insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
62                       AtomicOrdering MemOpOrder,
63                       function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
64     void expandAtomicOpToLLSC(
65         Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
66         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
67     void expandPartwordAtomicRMW(
68         AtomicRMWInst *I,
69         TargetLoweringBase::AtomicExpansionKind ExpansionKind);
70     void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
71 
72     AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
73     static Value *insertRMWCmpXchgLoop(
74         IRBuilder<> &Builder, Type *ResultType, Value *Addr,
75         AtomicOrdering MemOpOrder,
76         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
77         CreateCmpXchgInstFun CreateCmpXchg);
78 
79     bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
80     bool isIdempotentRMW(AtomicRMWInst *AI);
81     bool simplifyIdempotentRMW(AtomicRMWInst *AI);
82 
83     bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
84                                  Value *PointerOperand, Value *ValueOperand,
85                                  Value *CASExpected, AtomicOrdering Ordering,
86                                  AtomicOrdering Ordering2,
87                                  ArrayRef<RTLIB::Libcall> Libcalls);
88     void expandAtomicLoadToLibcall(LoadInst *LI);
89     void expandAtomicStoreToLibcall(StoreInst *LI);
90     void expandAtomicRMWToLibcall(AtomicRMWInst *I);
91     void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
92 
93     friend bool
94     llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
95                                    CreateCmpXchgInstFun CreateCmpXchg);
96   };
97 }
98 
99 char AtomicExpand::ID = 0;
100 char &llvm::AtomicExpandID = AtomicExpand::ID;
101 INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
102                    false, false)
103 
createAtomicExpandPass(const TargetMachine * TM)104 FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
105   return new AtomicExpand(TM);
106 }
107 
108 namespace {
109 // Helper functions to retrieve the size of atomic instructions.
getAtomicOpSize(LoadInst * LI)110 unsigned getAtomicOpSize(LoadInst *LI) {
111   const DataLayout &DL = LI->getModule()->getDataLayout();
112   return DL.getTypeStoreSize(LI->getType());
113 }
114 
getAtomicOpSize(StoreInst * SI)115 unsigned getAtomicOpSize(StoreInst *SI) {
116   const DataLayout &DL = SI->getModule()->getDataLayout();
117   return DL.getTypeStoreSize(SI->getValueOperand()->getType());
118 }
119 
getAtomicOpSize(AtomicRMWInst * RMWI)120 unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
121   const DataLayout &DL = RMWI->getModule()->getDataLayout();
122   return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
123 }
124 
getAtomicOpSize(AtomicCmpXchgInst * CASI)125 unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
126   const DataLayout &DL = CASI->getModule()->getDataLayout();
127   return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
128 }
129 
130 // Helper functions to retrieve the alignment of atomic instructions.
getAtomicOpAlign(LoadInst * LI)131 unsigned getAtomicOpAlign(LoadInst *LI) {
132   unsigned Align = LI->getAlignment();
133   // In the future, if this IR restriction is relaxed, we should
134   // return DataLayout::getABITypeAlignment when there's no align
135   // value.
136   assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
137   return Align;
138 }
139 
getAtomicOpAlign(StoreInst * SI)140 unsigned getAtomicOpAlign(StoreInst *SI) {
141   unsigned Align = SI->getAlignment();
142   // In the future, if this IR restriction is relaxed, we should
143   // return DataLayout::getABITypeAlignment when there's no align
144   // value.
145   assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
146   return Align;
147 }
148 
getAtomicOpAlign(AtomicRMWInst * RMWI)149 unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
150   // TODO(PR27168): This instruction has no alignment attribute, but unlike the
151   // default alignment for load/store, the default here is to assume
152   // it has NATURAL alignment, not DataLayout-specified alignment.
153   const DataLayout &DL = RMWI->getModule()->getDataLayout();
154   return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
155 }
156 
getAtomicOpAlign(AtomicCmpXchgInst * CASI)157 unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
158   // TODO(PR27168): same comment as above.
159   const DataLayout &DL = CASI->getModule()->getDataLayout();
160   return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
161 }
162 
163 // Determine if a particular atomic operation has a supported size,
164 // and is of appropriate alignment, to be passed through for target
165 // lowering. (Versus turning into a __atomic libcall)
166 template <typename Inst>
atomicSizeSupported(const TargetLowering * TLI,Inst * I)167 bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
168   unsigned Size = getAtomicOpSize(I);
169   unsigned Align = getAtomicOpAlign(I);
170   return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
171 }
172 
173 } // end anonymous namespace
174 
runOnFunction(Function & F)175 bool AtomicExpand::runOnFunction(Function &F) {
176   if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
177     return false;
178   TLI = TM->getSubtargetImpl(F)->getTargetLowering();
179 
180   SmallVector<Instruction *, 1> AtomicInsts;
181 
182   // Changing control-flow while iterating through it is a bad idea, so gather a
183   // list of all atomic instructions before we start.
184   for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
185     Instruction *I = &*II;
186     if (I->isAtomic() && !isa<FenceInst>(I))
187       AtomicInsts.push_back(I);
188   }
189 
190   bool MadeChange = false;
191   for (auto I : AtomicInsts) {
192     auto LI = dyn_cast<LoadInst>(I);
193     auto SI = dyn_cast<StoreInst>(I);
194     auto RMWI = dyn_cast<AtomicRMWInst>(I);
195     auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
196     assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
197 
198     // If the Size/Alignment is not supported, replace with a libcall.
199     if (LI) {
200       if (!atomicSizeSupported(TLI, LI)) {
201         expandAtomicLoadToLibcall(LI);
202         MadeChange = true;
203         continue;
204       }
205     } else if (SI) {
206       if (!atomicSizeSupported(TLI, SI)) {
207         expandAtomicStoreToLibcall(SI);
208         MadeChange = true;
209         continue;
210       }
211     } else if (RMWI) {
212       if (!atomicSizeSupported(TLI, RMWI)) {
213         expandAtomicRMWToLibcall(RMWI);
214         MadeChange = true;
215         continue;
216       }
217     } else if (CASI) {
218       if (!atomicSizeSupported(TLI, CASI)) {
219         expandAtomicCASToLibcall(CASI);
220         MadeChange = true;
221         continue;
222       }
223     }
224 
225     if (TLI->shouldInsertFencesForAtomic(I)) {
226       auto FenceOrdering = AtomicOrdering::Monotonic;
227       bool IsStore, IsLoad;
228       if (LI && isAcquireOrStronger(LI->getOrdering())) {
229         FenceOrdering = LI->getOrdering();
230         LI->setOrdering(AtomicOrdering::Monotonic);
231         IsStore = false;
232         IsLoad = true;
233       } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
234         FenceOrdering = SI->getOrdering();
235         SI->setOrdering(AtomicOrdering::Monotonic);
236         IsStore = true;
237         IsLoad = false;
238       } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
239                           isAcquireOrStronger(RMWI->getOrdering()))) {
240         FenceOrdering = RMWI->getOrdering();
241         RMWI->setOrdering(AtomicOrdering::Monotonic);
242         IsStore = IsLoad = true;
243       } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
244                  (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
245                   isAcquireOrStronger(CASI->getSuccessOrdering()))) {
246         // If a compare and swap is lowered to LL/SC, we can do smarter fence
247         // insertion, with a stronger one on the success path than on the
248         // failure path. As a result, fence insertion is directly done by
249         // expandAtomicCmpXchg in that case.
250         FenceOrdering = CASI->getSuccessOrdering();
251         CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
252         CASI->setFailureOrdering(AtomicOrdering::Monotonic);
253         IsStore = IsLoad = true;
254       }
255 
256       if (FenceOrdering != AtomicOrdering::Monotonic) {
257         MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
258       }
259     }
260 
261     if (LI) {
262       if (LI->getType()->isFloatingPointTy()) {
263         // TODO: add a TLI hook to control this so that each target can
264         // convert to lowering the original type one at a time.
265         LI = convertAtomicLoadToIntegerType(LI);
266         assert(LI->getType()->isIntegerTy() && "invariant broken");
267         MadeChange = true;
268       }
269 
270       MadeChange |= tryExpandAtomicLoad(LI);
271     } else if (SI) {
272       if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
273         // TODO: add a TLI hook to control this so that each target can
274         // convert to lowering the original type one at a time.
275         SI = convertAtomicStoreToIntegerType(SI);
276         assert(SI->getValueOperand()->getType()->isIntegerTy() &&
277                "invariant broken");
278         MadeChange = true;
279       }
280 
281       if (TLI->shouldExpandAtomicStoreInIR(SI))
282         MadeChange |= expandAtomicStore(SI);
283     } else if (RMWI) {
284       // There are two different ways of expanding RMW instructions:
285       // - into a load if it is idempotent
286       // - into a Cmpxchg/LL-SC loop otherwise
287       // we try them in that order.
288 
289       if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
290         MadeChange = true;
291       } else {
292         MadeChange |= tryExpandAtomicRMW(RMWI);
293       }
294     } else if (CASI) {
295       // TODO: when we're ready to make the change at the IR level, we can
296       // extend convertCmpXchgToInteger for floating point too.
297       assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
298              "unimplemented - floating point not legal at IR level");
299       if (CASI->getCompareOperand()->getType()->isPointerTy() ) {
300         // TODO: add a TLI hook to control this so that each target can
301         // convert to lowering the original type one at a time.
302         CASI = convertCmpXchgToIntegerType(CASI);
303         assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
304                "invariant broken");
305         MadeChange = true;
306       }
307 
308       unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
309       unsigned ValueSize = getAtomicOpSize(CASI);
310       if (ValueSize < MinCASSize) {
311         assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
312                "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
313         expandPartwordCmpXchg(CASI);
314       } else {
315         if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
316           MadeChange |= expandAtomicCmpXchg(CASI);
317       }
318     }
319   }
320   return MadeChange;
321 }
322 
bracketInstWithFences(Instruction * I,AtomicOrdering Order,bool IsStore,bool IsLoad)323 bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
324                                          bool IsStore, bool IsLoad) {
325   IRBuilder<> Builder(I);
326 
327   auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
328 
329   auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
330   // The trailing fence is emitted before the instruction instead of after
331   // because there is no easy way of setting Builder insertion point after
332   // an instruction. So we must erase it from the BB, and insert it back
333   // in the right place.
334   // We have a guard here because not every atomic operation generates a
335   // trailing fence.
336   if (TrailingFence) {
337     TrailingFence->removeFromParent();
338     TrailingFence->insertAfter(I);
339   }
340 
341   return (LeadingFence || TrailingFence);
342 }
343 
344 /// Get the iX type with the same bitwidth as T.
getCorrespondingIntegerType(Type * T,const DataLayout & DL)345 IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
346                                                        const DataLayout &DL) {
347   EVT VT = TLI->getValueType(DL, T);
348   unsigned BitWidth = VT.getStoreSizeInBits();
349   assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
350   return IntegerType::get(T->getContext(), BitWidth);
351 }
352 
353 /// Convert an atomic load of a non-integral type to an integer load of the
354 /// equivalent bitwidth.  See the function comment on
355 /// convertAtomicStoreToIntegerType for background.
convertAtomicLoadToIntegerType(LoadInst * LI)356 LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
357   auto *M = LI->getModule();
358   Type *NewTy = getCorrespondingIntegerType(LI->getType(),
359                                             M->getDataLayout());
360 
361   IRBuilder<> Builder(LI);
362 
363   Value *Addr = LI->getPointerOperand();
364   Type *PT = PointerType::get(NewTy,
365                               Addr->getType()->getPointerAddressSpace());
366   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
367 
368   auto *NewLI = Builder.CreateLoad(NewAddr);
369   NewLI->setAlignment(LI->getAlignment());
370   NewLI->setVolatile(LI->isVolatile());
371   NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
372   DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
373 
374   Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
375   LI->replaceAllUsesWith(NewVal);
376   LI->eraseFromParent();
377   return NewLI;
378 }
379 
tryExpandAtomicLoad(LoadInst * LI)380 bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
381   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
382   case TargetLoweringBase::AtomicExpansionKind::None:
383     return false;
384   case TargetLoweringBase::AtomicExpansionKind::LLSC:
385     expandAtomicOpToLLSC(
386         LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
387         [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
388     return true;
389   case TargetLoweringBase::AtomicExpansionKind::LLOnly:
390     return expandAtomicLoadToLL(LI);
391   case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
392     return expandAtomicLoadToCmpXchg(LI);
393   }
394   llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
395 }
396 
expandAtomicLoadToLL(LoadInst * LI)397 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
398   IRBuilder<> Builder(LI);
399 
400   // On some architectures, load-linked instructions are atomic for larger
401   // sizes than normal loads. For example, the only 64-bit load guaranteed
402   // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
403   Value *Val =
404       TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
405   TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
406 
407   LI->replaceAllUsesWith(Val);
408   LI->eraseFromParent();
409 
410   return true;
411 }
412 
expandAtomicLoadToCmpXchg(LoadInst * LI)413 bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
414   IRBuilder<> Builder(LI);
415   AtomicOrdering Order = LI->getOrdering();
416   Value *Addr = LI->getPointerOperand();
417   Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
418   Constant *DummyVal = Constant::getNullValue(Ty);
419 
420   Value *Pair = Builder.CreateAtomicCmpXchg(
421       Addr, DummyVal, DummyVal, Order,
422       AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
423   Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
424 
425   LI->replaceAllUsesWith(Loaded);
426   LI->eraseFromParent();
427 
428   return true;
429 }
430 
431 /// Convert an atomic store of a non-integral type to an integer store of the
432 /// equivalent bitwidth.  We used to not support floating point or vector
433 /// atomics in the IR at all.  The backends learned to deal with the bitcast
434 /// idiom because that was the only way of expressing the notion of a atomic
435 /// float or vector store.  The long term plan is to teach each backend to
436 /// instruction select from the original atomic store, but as a migration
437 /// mechanism, we convert back to the old format which the backends understand.
438 /// Each backend will need individual work to recognize the new format.
convertAtomicStoreToIntegerType(StoreInst * SI)439 StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
440   IRBuilder<> Builder(SI);
441   auto *M = SI->getModule();
442   Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
443                                             M->getDataLayout());
444   Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
445 
446   Value *Addr = SI->getPointerOperand();
447   Type *PT = PointerType::get(NewTy,
448                               Addr->getType()->getPointerAddressSpace());
449   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
450 
451   StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
452   NewSI->setAlignment(SI->getAlignment());
453   NewSI->setVolatile(SI->isVolatile());
454   NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
455   DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
456   SI->eraseFromParent();
457   return NewSI;
458 }
459 
expandAtomicStore(StoreInst * SI)460 bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
461   // This function is only called on atomic stores that are too large to be
462   // atomic if implemented as a native store. So we replace them by an
463   // atomic swap, that can be implemented for example as a ldrex/strex on ARM
464   // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
465   // It is the responsibility of the target to only signal expansion via
466   // shouldExpandAtomicRMW in cases where this is required and possible.
467   IRBuilder<> Builder(SI);
468   AtomicRMWInst *AI =
469       Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
470                               SI->getValueOperand(), SI->getOrdering());
471   SI->eraseFromParent();
472 
473   // Now we have an appropriate swap instruction, lower it as usual.
474   return tryExpandAtomicRMW(AI);
475 }
476 
createCmpXchgInstFun(IRBuilder<> & Builder,Value * Addr,Value * Loaded,Value * NewVal,AtomicOrdering MemOpOrder,Value * & Success,Value * & NewLoaded)477 static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
478                                  Value *Loaded, Value *NewVal,
479                                  AtomicOrdering MemOpOrder,
480                                  Value *&Success, Value *&NewLoaded) {
481   Value* Pair = Builder.CreateAtomicCmpXchg(
482       Addr, Loaded, NewVal, MemOpOrder,
483       AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
484   Success = Builder.CreateExtractValue(Pair, 1, "success");
485   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
486 }
487 
488 /// Emit IR to implement the given atomicrmw operation on values in registers,
489 /// returning the new value.
performAtomicOp(AtomicRMWInst::BinOp Op,IRBuilder<> & Builder,Value * Loaded,Value * Inc)490 static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
491                               Value *Loaded, Value *Inc) {
492   Value *NewVal;
493   switch (Op) {
494   case AtomicRMWInst::Xchg:
495     return Inc;
496   case AtomicRMWInst::Add:
497     return Builder.CreateAdd(Loaded, Inc, "new");
498   case AtomicRMWInst::Sub:
499     return Builder.CreateSub(Loaded, Inc, "new");
500   case AtomicRMWInst::And:
501     return Builder.CreateAnd(Loaded, Inc, "new");
502   case AtomicRMWInst::Nand:
503     return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
504   case AtomicRMWInst::Or:
505     return Builder.CreateOr(Loaded, Inc, "new");
506   case AtomicRMWInst::Xor:
507     return Builder.CreateXor(Loaded, Inc, "new");
508   case AtomicRMWInst::Max:
509     NewVal = Builder.CreateICmpSGT(Loaded, Inc);
510     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
511   case AtomicRMWInst::Min:
512     NewVal = Builder.CreateICmpSLE(Loaded, Inc);
513     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
514   case AtomicRMWInst::UMax:
515     NewVal = Builder.CreateICmpUGT(Loaded, Inc);
516     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
517   case AtomicRMWInst::UMin:
518     NewVal = Builder.CreateICmpULE(Loaded, Inc);
519     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
520   default:
521     llvm_unreachable("Unknown atomic op");
522   }
523 }
524 
tryExpandAtomicRMW(AtomicRMWInst * AI)525 bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
526   switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
527   case TargetLoweringBase::AtomicExpansionKind::None:
528     return false;
529   case TargetLoweringBase::AtomicExpansionKind::LLSC: {
530     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
531     unsigned ValueSize = getAtomicOpSize(AI);
532     if (ValueSize < MinCASSize) {
533       llvm_unreachable(
534           "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
535     } else {
536       auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
537         return performAtomicOp(AI->getOperation(), Builder, Loaded,
538                                AI->getValOperand());
539       };
540       expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
541                            AI->getOrdering(), PerformOp);
542     }
543     return true;
544   }
545   case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
546     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
547     unsigned ValueSize = getAtomicOpSize(AI);
548     if (ValueSize < MinCASSize) {
549       expandPartwordAtomicRMW(AI,
550                               TargetLoweringBase::AtomicExpansionKind::CmpXChg);
551     } else {
552       expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
553     }
554     return true;
555   }
556   default:
557     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
558   }
559 }
560 
561 namespace {
562 
563 /// Result values from createMaskInstrs helper.
564 struct PartwordMaskValues {
565   Type *WordType;
566   Type *ValueType;
567   Value *AlignedAddr;
568   Value *ShiftAmt;
569   Value *Mask;
570   Value *Inv_Mask;
571 };
572 } // end anonymous namespace
573 
574 /// This is a helper function which builds instructions to provide
575 /// values necessary for partword atomic operations. It takes an
576 /// incoming address, Addr, and ValueType, and constructs the address,
577 /// shift-amounts and masks needed to work with a larger value of size
578 /// WordSize.
579 ///
580 /// AlignedAddr: Addr rounded down to a multiple of WordSize
581 ///
582 /// ShiftAmt: Number of bits to right-shift a WordSize value loaded
583 ///           from AlignAddr for it to have the same value as if
584 ///           ValueType was loaded from Addr.
585 ///
586 /// Mask: Value to mask with the value loaded from AlignAddr to
587 ///       include only the part that would've been loaded from Addr.
588 ///
589 /// Inv_Mask: The inverse of Mask.
590 
createMaskInstrs(IRBuilder<> & Builder,Instruction * I,Type * ValueType,Value * Addr,unsigned WordSize)591 static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
592                                            Type *ValueType, Value *Addr,
593                                            unsigned WordSize) {
594   PartwordMaskValues Ret;
595 
596   BasicBlock *BB = I->getParent();
597   Function *F = BB->getParent();
598   Module *M = I->getModule();
599 
600   LLVMContext &Ctx = F->getContext();
601   const DataLayout &DL = M->getDataLayout();
602 
603   unsigned ValueSize = DL.getTypeStoreSize(ValueType);
604 
605   assert(ValueSize < WordSize);
606 
607   Ret.ValueType = ValueType;
608   Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
609 
610   Type *WordPtrType =
611       Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
612 
613   Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
614   Ret.AlignedAddr = Builder.CreateIntToPtr(
615       Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
616       "AlignedAddr");
617 
618   Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
619   if (DL.isLittleEndian()) {
620     // turn bytes into bits
621     Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
622   } else {
623     // turn bytes into bits, and count from the other side.
624     Ret.ShiftAmt =
625         Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
626   }
627 
628   Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
629   Ret.Mask = Builder.CreateShl(
630       ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
631       "Mask");
632   Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
633 
634   return Ret;
635 }
636 
637 /// Emit IR to implement a masked version of a given atomicrmw
638 /// operation. (That is, only the bits under the Mask should be
639 /// affected by the operation)
performMaskedAtomicOp(AtomicRMWInst::BinOp Op,IRBuilder<> & Builder,Value * Loaded,Value * Shifted_Inc,Value * Inc,const PartwordMaskValues & PMV)640 static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
641                                     IRBuilder<> &Builder, Value *Loaded,
642                                     Value *Shifted_Inc, Value *Inc,
643                                     const PartwordMaskValues &PMV) {
644   switch (Op) {
645   case AtomicRMWInst::Xchg: {
646     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
647     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
648     return FinalVal;
649   }
650   case AtomicRMWInst::Or:
651   case AtomicRMWInst::Xor:
652     // Or/Xor won't affect any other bits, so can just be done
653     // directly.
654     return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
655   case AtomicRMWInst::Add:
656   case AtomicRMWInst::Sub:
657   case AtomicRMWInst::And:
658   case AtomicRMWInst::Nand: {
659     // The other arithmetic ops need to be masked into place.
660     Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
661     Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
662     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
663     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
664     return FinalVal;
665   }
666   case AtomicRMWInst::Max:
667   case AtomicRMWInst::Min:
668   case AtomicRMWInst::UMax:
669   case AtomicRMWInst::UMin: {
670     // Finally, comparison ops will operate on the full value, so
671     // truncate down to the original size, and expand out again after
672     // doing the operation.
673     Value *Loaded_Shiftdown = Builder.CreateTrunc(
674         Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
675     Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
676     Value *NewVal_Shiftup = Builder.CreateShl(
677         Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
678     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
679     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
680     return FinalVal;
681   }
682   default:
683     llvm_unreachable("Unknown atomic op");
684   }
685 }
686 
687 /// Expand a sub-word atomicrmw operation into an appropriate
688 /// word-sized operation.
689 ///
690 /// It will create an LL/SC or cmpxchg loop, as appropriate, the same
691 /// way as a typical atomicrmw expansion. The only difference here is
692 /// that the operation inside of the loop must operate only upon a
693 /// part of the value.
expandPartwordAtomicRMW(AtomicRMWInst * AI,TargetLoweringBase::AtomicExpansionKind ExpansionKind)694 void AtomicExpand::expandPartwordAtomicRMW(
695     AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
696 
697   assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
698 
699   AtomicOrdering MemOpOrder = AI->getOrdering();
700 
701   IRBuilder<> Builder(AI);
702 
703   PartwordMaskValues PMV =
704       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
705                        TLI->getMinCmpXchgSizeInBits() / 8);
706 
707   Value *ValOperand_Shifted =
708       Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
709                         PMV.ShiftAmt, "ValOperand_Shifted");
710 
711   auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
712     return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
713                                  ValOperand_Shifted, AI->getValOperand(), PMV);
714   };
715 
716   // TODO: When we're ready to support LLSC conversions too, use
717   // insertRMWLLSCLoop here for ExpansionKind==LLSC.
718   Value *OldResult =
719       insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
720                            PerformPartwordOp, createCmpXchgInstFun);
721   Value *FinalOldResult = Builder.CreateTrunc(
722       Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
723   AI->replaceAllUsesWith(FinalOldResult);
724   AI->eraseFromParent();
725 }
726 
expandPartwordCmpXchg(AtomicCmpXchgInst * CI)727 void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
728   // The basic idea here is that we're expanding a cmpxchg of a
729   // smaller memory size up to a word-sized cmpxchg. To do this, we
730   // need to add a retry-loop for strong cmpxchg, so that
731   // modifications to other parts of the word don't cause a spurious
732   // failure.
733 
734   // This generates code like the following:
735   //     [[Setup mask values PMV.*]]
736   //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
737   //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
738   //     %InitLoaded = load i32* %addr
739   //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
740   //     br partword.cmpxchg.loop
741   // partword.cmpxchg.loop:
742   //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
743   //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
744   //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
745   //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
746   //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
747   //        i32 %FullWord_NewVal success_ordering failure_ordering
748   //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
749   //     %Success = extractvalue { i32, i1 } %NewCI, 1
750   //     br i1 %Success, label %partword.cmpxchg.end,
751   //        label %partword.cmpxchg.failure
752   // partword.cmpxchg.failure:
753   //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
754   //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
755   //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
756   //         label %partword.cmpxchg.end
757   // partword.cmpxchg.end:
758   //    %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
759   //    %FinalOldVal = trunc i32 %tmp1 to i8
760   //    %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
761   //    %Res = insertvalue { i8, i1 } %25, i1 %Success, 1
762 
763   Value *Addr = CI->getPointerOperand();
764   Value *Cmp = CI->getCompareOperand();
765   Value *NewVal = CI->getNewValOperand();
766 
767   BasicBlock *BB = CI->getParent();
768   Function *F = BB->getParent();
769   IRBuilder<> Builder(CI);
770   LLVMContext &Ctx = Builder.getContext();
771 
772   const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;
773 
774   BasicBlock *EndBB =
775       BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
776   auto FailureBB =
777       BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
778   auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
779 
780   // The split call above "helpfully" added a branch at the end of BB
781   // (to the wrong place).
782   std::prev(BB->end())->eraseFromParent();
783   Builder.SetInsertPoint(BB);
784 
785   PartwordMaskValues PMV = createMaskInstrs(
786       Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);
787 
788   // Shift the incoming values over, into the right location in the word.
789   Value *NewVal_Shifted =
790       Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
791   Value *Cmp_Shifted =
792       Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
793 
794   // Load the entire current word, and mask into place the expected and new
795   // values
796   LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
797   InitLoaded->setVolatile(CI->isVolatile());
798   Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
799   Builder.CreateBr(LoopBB);
800 
801   // partword.cmpxchg.loop:
802   Builder.SetInsertPoint(LoopBB);
803   PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
804   Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
805 
806   // Mask/Or the expected and new values into place in the loaded word.
807   Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
808   Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
809   AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
810       PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
811       CI->getFailureOrdering(), CI->getSynchScope());
812   NewCI->setVolatile(CI->isVolatile());
813   // When we're building a strong cmpxchg, we need a loop, so you
814   // might think we could use a weak cmpxchg inside. But, using strong
815   // allows the below comparison for ShouldContinue, and we're
816   // expecting the underlying cmpxchg to be a machine instruction,
817   // which is strong anyways.
818   NewCI->setWeak(CI->isWeak());
819 
820   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
821   Value *Success = Builder.CreateExtractValue(NewCI, 1);
822 
823   if (CI->isWeak())
824     Builder.CreateBr(EndBB);
825   else
826     Builder.CreateCondBr(Success, EndBB, FailureBB);
827 
828   // partword.cmpxchg.failure:
829   Builder.SetInsertPoint(FailureBB);
830   // Upon failure, verify that the masked-out part of the loaded value
831   // has been modified.  If it didn't, abort the cmpxchg, since the
832   // masked-in part must've.
833   Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
834   Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
835   Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
836 
837   // Add the second value to the phi from above
838   Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
839 
840   // partword.cmpxchg.end:
841   Builder.SetInsertPoint(CI);
842 
843   Value *FinalOldVal = Builder.CreateTrunc(
844       Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
845   Value *Res = UndefValue::get(CI->getType());
846   Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
847   Res = Builder.CreateInsertValue(Res, Success, 1);
848 
849   CI->replaceAllUsesWith(Res);
850   CI->eraseFromParent();
851 }
852 
expandAtomicOpToLLSC(Instruction * I,Type * ResultType,Value * Addr,AtomicOrdering MemOpOrder,function_ref<Value * (IRBuilder<> &,Value *)> PerformOp)853 void AtomicExpand::expandAtomicOpToLLSC(
854     Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
855     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
856   IRBuilder<> Builder(I);
857   Value *Loaded =
858       insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);
859 
860   I->replaceAllUsesWith(Loaded);
861   I->eraseFromParent();
862 }
863 
insertRMWLLSCLoop(IRBuilder<> & Builder,Type * ResultTy,Value * Addr,AtomicOrdering MemOpOrder,function_ref<Value * (IRBuilder<> &,Value *)> PerformOp)864 Value *AtomicExpand::insertRMWLLSCLoop(
865     IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
866     AtomicOrdering MemOpOrder,
867     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
868   LLVMContext &Ctx = Builder.getContext();
869   BasicBlock *BB = Builder.GetInsertBlock();
870   Function *F = BB->getParent();
871 
872   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
873   //
874   // The standard expansion we produce is:
875   //     [...]
876   // atomicrmw.start:
877   //     %loaded = @load.linked(%addr)
878   //     %new = some_op iN %loaded, %incr
879   //     %stored = @store_conditional(%new, %addr)
880   //     %try_again = icmp i32 ne %stored, 0
881   //     br i1 %try_again, label %loop, label %atomicrmw.end
882   // atomicrmw.end:
883   //     [...]
884   BasicBlock *ExitBB =
885       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
886   BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
887 
888   // The split call above "helpfully" added a branch at the end of BB (to the
889   // wrong place).
890   std::prev(BB->end())->eraseFromParent();
891   Builder.SetInsertPoint(BB);
892   Builder.CreateBr(LoopBB);
893 
894   // Start the main loop block now that we've taken care of the preliminaries.
895   Builder.SetInsertPoint(LoopBB);
896   Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
897 
898   Value *NewVal = PerformOp(Builder, Loaded);
899 
900   Value *StoreSuccess =
901       TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
902   Value *TryAgain = Builder.CreateICmpNE(
903       StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
904   Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
905 
906   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
907   return Loaded;
908 }
909 
910 /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
911 /// the equivalent bitwidth.  We used to not support pointer cmpxchg in the
912 /// IR.  As a migration step, we convert back to what use to be the standard
913 /// way to represent a pointer cmpxchg so that we can update backends one by
914 /// one.
convertCmpXchgToIntegerType(AtomicCmpXchgInst * CI)915 AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
916   auto *M = CI->getModule();
917   Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
918                                             M->getDataLayout());
919 
920   IRBuilder<> Builder(CI);
921 
922   Value *Addr = CI->getPointerOperand();
923   Type *PT = PointerType::get(NewTy,
924                               Addr->getType()->getPointerAddressSpace());
925   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
926 
927   Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
928   Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
929 
930 
931   auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
932                                             CI->getSuccessOrdering(),
933                                             CI->getFailureOrdering(),
934                                             CI->getSynchScope());
935   NewCI->setVolatile(CI->isVolatile());
936   NewCI->setWeak(CI->isWeak());
937   DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
938 
939   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
940   Value *Succ = Builder.CreateExtractValue(NewCI, 1);
941 
942   OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
943 
944   Value *Res = UndefValue::get(CI->getType());
945   Res = Builder.CreateInsertValue(Res, OldVal, 0);
946   Res = Builder.CreateInsertValue(Res, Succ, 1);
947 
948   CI->replaceAllUsesWith(Res);
949   CI->eraseFromParent();
950   return NewCI;
951 }
952 
953 
expandAtomicCmpXchg(AtomicCmpXchgInst * CI)954 bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
955   AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
956   AtomicOrdering FailureOrder = CI->getFailureOrdering();
957   Value *Addr = CI->getPointerOperand();
958   BasicBlock *BB = CI->getParent();
959   Function *F = BB->getParent();
960   LLVMContext &Ctx = F->getContext();
961   // If shouldInsertFencesForAtomic() returns true, then the target does not
962   // want to deal with memory orders, and emitLeading/TrailingFence should take
963   // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
964   // should preserve the ordering.
965   bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
966   AtomicOrdering MemOpOrder =
967       ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;
968 
969   // In implementations which use a barrier to achieve release semantics, we can
970   // delay emitting this barrier until we know a store is actually going to be
971   // attempted. The cost of this delay is that we need 2 copies of the block
972   // emitting the load-linked, affecting code size.
973   //
974   // Ideally, this logic would be unconditional except for the minsize check
975   // since in other cases the extra blocks naturally collapse down to the
976   // minimal loop. Unfortunately, this puts too much stress on later
977   // optimisations so we avoid emitting the extra logic in those cases too.
978   bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
979                            SuccessOrder != AtomicOrdering::Monotonic &&
980                            SuccessOrder != AtomicOrdering::Acquire &&
981                            !F->optForMinSize();
982 
983   // There's no overhead for sinking the release barrier in a weak cmpxchg, so
984   // do it even on minsize.
985   bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();
986 
987   // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
988   //
989   // The full expansion we produce is:
990   //     [...]
991   // cmpxchg.start:
992   //     %unreleasedload = @load.linked(%addr)
993   //     %should_store = icmp eq %unreleasedload, %desired
994   //     br i1 %should_store, label %cmpxchg.fencedstore,
995   //                          label %cmpxchg.nostore
996   // cmpxchg.releasingstore:
997   //     fence?
998   //     br label cmpxchg.trystore
999   // cmpxchg.trystore:
1000   //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
1001   //                            [%releasedload, %cmpxchg.releasedload]
1002   //     %stored = @store_conditional(%new, %addr)
1003   //     %success = icmp eq i32 %stored, 0
1004   //     br i1 %success, label %cmpxchg.success,
1005   //                     label %cmpxchg.releasedload/%cmpxchg.failure
1006   // cmpxchg.releasedload:
1007   //     %releasedload = @load.linked(%addr)
1008   //     %should_store = icmp eq %releasedload, %desired
1009   //     br i1 %should_store, label %cmpxchg.trystore,
1010   //                          label %cmpxchg.failure
1011   // cmpxchg.success:
1012   //     fence?
1013   //     br label %cmpxchg.end
1014   // cmpxchg.nostore:
1015   //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
1016   //                           [%releasedload,
1017   //                               %cmpxchg.releasedload/%cmpxchg.trystore]
1018   //     @load_linked_fail_balance()?
1019   //     br label %cmpxchg.failure
1020   // cmpxchg.failure:
1021   //     fence?
1022   //     br label %cmpxchg.end
1023   // cmpxchg.end:
1024   //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
1025   //                   [%loaded.trystore, %cmpxchg.trystore]
1026   //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
1027   //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
1028   //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
1029   //     [...]
1030   BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
1031   auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
1032   auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
1033   auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
1034   auto ReleasedLoadBB =
1035       BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
1036   auto TryStoreBB =
1037       BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
1038   auto ReleasingStoreBB =
1039       BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
1040   auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
1041 
1042   // This grabs the DebugLoc from CI
1043   IRBuilder<> Builder(CI);
1044 
1045   // The split call above "helpfully" added a branch at the end of BB (to the
1046   // wrong place), but we might want a fence too. It's easiest to just remove
1047   // the branch entirely.
1048   std::prev(BB->end())->eraseFromParent();
1049   Builder.SetInsertPoint(BB);
1050   if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
1051     TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
1052                           /*IsLoad=*/true);
1053   Builder.CreateBr(StartBB);
1054 
1055   // Start the main loop block now that we've taken care of the preliminaries.
1056   Builder.SetInsertPoint(StartBB);
1057   Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
1058   Value *ShouldStore = Builder.CreateICmpEQ(
1059       UnreleasedLoad, CI->getCompareOperand(), "should_store");
1060 
1061   // If the cmpxchg doesn't actually need any ordering when it fails, we can
1062   // jump straight past that fence instruction (if it exists).
1063   Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
1064 
1065   Builder.SetInsertPoint(ReleasingStoreBB);
1066   if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
1067     TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
1068                           /*IsLoad=*/true);
1069   Builder.CreateBr(TryStoreBB);
1070 
1071   Builder.SetInsertPoint(TryStoreBB);
1072   Value *StoreSuccess = TLI->emitStoreConditional(
1073       Builder, CI->getNewValOperand(), Addr, MemOpOrder);
1074   StoreSuccess = Builder.CreateICmpEQ(
1075       StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
1076   BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
1077   Builder.CreateCondBr(StoreSuccess, SuccessBB,
1078                        CI->isWeak() ? FailureBB : RetryBB);
1079 
1080   Builder.SetInsertPoint(ReleasedLoadBB);
1081   Value *SecondLoad;
1082   if (HasReleasedLoadBB) {
1083     SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
1084     ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
1085                                        "should_store");
1086 
1087     // If the cmpxchg doesn't actually need any ordering when it fails, we can
1088     // jump straight past that fence instruction (if it exists).
1089     Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
1090   } else
1091     Builder.CreateUnreachable();
1092 
1093   // Make sure later instructions don't get reordered with a fence if
1094   // necessary.
1095   Builder.SetInsertPoint(SuccessBB);
1096   if (ShouldInsertFencesForAtomic)
1097     TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
1098                            /*IsLoad=*/true);
1099   Builder.CreateBr(ExitBB);
1100 
1101   Builder.SetInsertPoint(NoStoreBB);
1102   // In the failing case, where we don't execute the store-conditional, the
1103   // target might want to balance out the load-linked with a dedicated
1104   // instruction (e.g., on ARM, clearing the exclusive monitor).
1105   TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
1106   Builder.CreateBr(FailureBB);
1107 
1108   Builder.SetInsertPoint(FailureBB);
1109   if (ShouldInsertFencesForAtomic)
1110     TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
1111                            /*IsLoad=*/true);
1112   Builder.CreateBr(ExitBB);
1113 
1114   // Finally, we have control-flow based knowledge of whether the cmpxchg
1115   // succeeded or not. We expose this to later passes by converting any
1116   // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
1117   // PHI.
1118   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
1119   PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
1120   Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
1121   Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
1122 
1123   // Setup the builder so we can create any PHIs we need.
1124   Value *Loaded;
1125   if (!HasReleasedLoadBB)
1126     Loaded = UnreleasedLoad;
1127   else {
1128     Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
1129     PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1130     TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
1131     TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
1132 
1133     Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
1134     PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1135     NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
1136     NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
1137 
1138     Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
1139     PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1140     ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
1141     ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
1142 
1143     Loaded = ExitLoaded;
1144   }
1145 
1146   // Look for any users of the cmpxchg that are just comparing the loaded value
1147   // against the desired one, and replace them with the CFG-derived version.
1148   SmallVector<ExtractValueInst *, 2> PrunedInsts;
1149   for (auto User : CI->users()) {
1150     ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
1151     if (!EV)
1152       continue;
1153 
1154     assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
1155            "weird extraction from { iN, i1 }");
1156 
1157     if (EV->getIndices()[0] == 0)
1158       EV->replaceAllUsesWith(Loaded);
1159     else
1160       EV->replaceAllUsesWith(Success);
1161 
1162     PrunedInsts.push_back(EV);
1163   }
1164 
1165   // We can remove the instructions now we're no longer iterating through them.
1166   for (auto EV : PrunedInsts)
1167     EV->eraseFromParent();
1168 
1169   if (!CI->use_empty()) {
1170     // Some use of the full struct return that we don't understand has happened,
1171     // so we've got to reconstruct it properly.
1172     Value *Res;
1173     Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
1174     Res = Builder.CreateInsertValue(Res, Success, 1);
1175 
1176     CI->replaceAllUsesWith(Res);
1177   }
1178 
1179   CI->eraseFromParent();
1180   return true;
1181 }
1182 
isIdempotentRMW(AtomicRMWInst * RMWI)1183 bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
1184   auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
1185   if(!C)
1186     return false;
1187 
1188   AtomicRMWInst::BinOp Op = RMWI->getOperation();
1189   switch(Op) {
1190     case AtomicRMWInst::Add:
1191     case AtomicRMWInst::Sub:
1192     case AtomicRMWInst::Or:
1193     case AtomicRMWInst::Xor:
1194       return C->isZero();
1195     case AtomicRMWInst::And:
1196       return C->isMinusOne();
1197     // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
1198     default:
1199       return false;
1200   }
1201 }
1202 
simplifyIdempotentRMW(AtomicRMWInst * RMWI)1203 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
1204   if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1205     tryExpandAtomicLoad(ResultingLoad);
1206     return true;
1207   }
1208   return false;
1209 }
1210 
insertRMWCmpXchgLoop(IRBuilder<> & Builder,Type * ResultTy,Value * Addr,AtomicOrdering MemOpOrder,function_ref<Value * (IRBuilder<> &,Value *)> PerformOp,CreateCmpXchgInstFun CreateCmpXchg)1211 Value *AtomicExpand::insertRMWCmpXchgLoop(
1212     IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
1213     AtomicOrdering MemOpOrder,
1214     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
1215     CreateCmpXchgInstFun CreateCmpXchg) {
1216   LLVMContext &Ctx = Builder.getContext();
1217   BasicBlock *BB = Builder.GetInsertBlock();
1218   Function *F = BB->getParent();
1219 
1220   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
1221   //
1222   // The standard expansion we produce is:
1223   //     [...]
1224   //     %init_loaded = load atomic iN* %addr
1225   //     br label %loop
1226   // loop:
1227   //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
1228   //     %new = some_op iN %loaded, %incr
1229   //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
1230   //     %new_loaded = extractvalue { iN, i1 } %pair, 0
1231   //     %success = extractvalue { iN, i1 } %pair, 1
1232   //     br i1 %success, label %atomicrmw.end, label %loop
1233   // atomicrmw.end:
1234   //     [...]
1235   BasicBlock *ExitBB =
1236       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
1237   BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
1238 
1239   // The split call above "helpfully" added a branch at the end of BB (to the
1240   // wrong place), but we want a load. It's easiest to just remove
1241   // the branch entirely.
1242   std::prev(BB->end())->eraseFromParent();
1243   Builder.SetInsertPoint(BB);
1244   LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
1245   // Atomics require at least natural alignment.
1246   InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
1247   Builder.CreateBr(LoopBB);
1248 
1249   // Start the main loop block now that we've taken care of the preliminaries.
1250   Builder.SetInsertPoint(LoopBB);
1251   PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
1252   Loaded->addIncoming(InitLoaded, BB);
1253 
1254   Value *NewVal = PerformOp(Builder, Loaded);
1255 
1256   Value *NewLoaded = nullptr;
1257   Value *Success = nullptr;
1258 
1259   CreateCmpXchg(Builder, Addr, Loaded, NewVal,
1260                 MemOpOrder == AtomicOrdering::Unordered
1261                     ? AtomicOrdering::Monotonic
1262                     : MemOpOrder,
1263                 Success, NewLoaded);
1264   assert(Success && NewLoaded);
1265 
1266   Loaded->addIncoming(NewLoaded, LoopBB);
1267 
1268   Builder.CreateCondBr(Success, ExitBB, LoopBB);
1269 
1270   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
1271   return NewLoaded;
1272 }
1273 
1274 // Note: This function is exposed externally by AtomicExpandUtils.h
expandAtomicRMWToCmpXchg(AtomicRMWInst * AI,CreateCmpXchgInstFun CreateCmpXchg)1275 bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
1276                                     CreateCmpXchgInstFun CreateCmpXchg) {
1277   IRBuilder<> Builder(AI);
1278   Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
1279       Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
1280       [&](IRBuilder<> &Builder, Value *Loaded) {
1281         return performAtomicOp(AI->getOperation(), Builder, Loaded,
1282                                AI->getValOperand());
1283       },
1284       CreateCmpXchg);
1285 
1286   AI->replaceAllUsesWith(Loaded);
1287   AI->eraseFromParent();
1288   return true;
1289 }
1290 
1291 // In order to use one of the sized library calls such as
1292 // __atomic_fetch_add_4, the alignment must be sufficient, the size
1293 // must be one of the potentially-specialized sizes, and the value
1294 // type must actually exist in C on the target (otherwise, the
1295 // function wouldn't actually be defined.)
canUseSizedAtomicCall(unsigned Size,unsigned Align,const DataLayout & DL)1296 static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
1297                                   const DataLayout &DL) {
1298   // TODO: "LargestSize" is an approximation for "largest type that
1299   // you can express in C". It seems to be the case that int128 is
1300   // supported on all 64-bit platforms, otherwise only up to 64-bit
1301   // integers are supported. If we get this wrong, then we'll try to
1302   // call a sized libcall that doesn't actually exist. There should
1303   // really be some more reliable way in LLVM of determining integer
1304   // sizes which are valid in the target's C ABI...
1305   unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
1306   return Align >= Size &&
1307          (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
1308          Size <= LargestSize;
1309 }
1310 
expandAtomicLoadToLibcall(LoadInst * I)1311 void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
1312   static const RTLIB::Libcall Libcalls[6] = {
1313       RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
1314       RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
1315   unsigned Size = getAtomicOpSize(I);
1316   unsigned Align = getAtomicOpAlign(I);
1317 
1318   bool expanded = expandAtomicOpToLibcall(
1319       I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
1320       I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1321   (void)expanded;
1322   assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor Load");
1323 }
1324 
expandAtomicStoreToLibcall(StoreInst * I)1325 void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
1326   static const RTLIB::Libcall Libcalls[6] = {
1327       RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
1328       RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
1329   unsigned Size = getAtomicOpSize(I);
1330   unsigned Align = getAtomicOpAlign(I);
1331 
1332   bool expanded = expandAtomicOpToLibcall(
1333       I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
1334       I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1335   (void)expanded;
1336   assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor Store");
1337 }
1338 
expandAtomicCASToLibcall(AtomicCmpXchgInst * I)1339 void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
1340   static const RTLIB::Libcall Libcalls[6] = {
1341       RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
1342       RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
1343       RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
1344   unsigned Size = getAtomicOpSize(I);
1345   unsigned Align = getAtomicOpAlign(I);
1346 
1347   bool expanded = expandAtomicOpToLibcall(
1348       I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
1349       I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
1350       Libcalls);
1351   (void)expanded;
1352   assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor CAS");
1353 }
1354 
GetRMWLibcall(AtomicRMWInst::BinOp Op)1355 static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
1356   static const RTLIB::Libcall LibcallsXchg[6] = {
1357       RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
1358       RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
1359       RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
1360   static const RTLIB::Libcall LibcallsAdd[6] = {
1361       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
1362       RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
1363       RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
1364   static const RTLIB::Libcall LibcallsSub[6] = {
1365       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
1366       RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
1367       RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
1368   static const RTLIB::Libcall LibcallsAnd[6] = {
1369       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
1370       RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
1371       RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
1372   static const RTLIB::Libcall LibcallsOr[6] = {
1373       RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
1374       RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
1375       RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
1376   static const RTLIB::Libcall LibcallsXor[6] = {
1377       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
1378       RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
1379       RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
1380   static const RTLIB::Libcall LibcallsNand[6] = {
1381       RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
1382       RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
1383       RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
1384 
1385   switch (Op) {
1386   case AtomicRMWInst::BAD_BINOP:
1387     llvm_unreachable("Should not have BAD_BINOP.");
1388   case AtomicRMWInst::Xchg:
1389     return makeArrayRef(LibcallsXchg);
1390   case AtomicRMWInst::Add:
1391     return makeArrayRef(LibcallsAdd);
1392   case AtomicRMWInst::Sub:
1393     return makeArrayRef(LibcallsSub);
1394   case AtomicRMWInst::And:
1395     return makeArrayRef(LibcallsAnd);
1396   case AtomicRMWInst::Or:
1397     return makeArrayRef(LibcallsOr);
1398   case AtomicRMWInst::Xor:
1399     return makeArrayRef(LibcallsXor);
1400   case AtomicRMWInst::Nand:
1401     return makeArrayRef(LibcallsNand);
1402   case AtomicRMWInst::Max:
1403   case AtomicRMWInst::Min:
1404   case AtomicRMWInst::UMax:
1405   case AtomicRMWInst::UMin:
1406     // No atomic libcalls are available for max/min/umax/umin.
1407     return {};
1408   }
1409   llvm_unreachable("Unexpected AtomicRMW operation.");
1410 }
1411 
expandAtomicRMWToLibcall(AtomicRMWInst * I)1412 void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
1413   ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
1414 
1415   unsigned Size = getAtomicOpSize(I);
1416   unsigned Align = getAtomicOpAlign(I);
1417 
1418   bool Success = false;
1419   if (!Libcalls.empty())
1420     Success = expandAtomicOpToLibcall(
1421         I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
1422         I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1423 
1424   // The expansion failed: either there were no libcalls at all for
1425   // the operation (min/max), or there were only size-specialized
1426   // libcalls (add/sub/etc) and we needed a generic. So, expand to a
1427   // CAS libcall, via a CAS loop, instead.
1428   if (!Success) {
1429     expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
1430                                        Value *Loaded, Value *NewVal,
1431                                        AtomicOrdering MemOpOrder,
1432                                        Value *&Success, Value *&NewLoaded) {
1433       // Create the CAS instruction normally...
1434       AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
1435           Addr, Loaded, NewVal, MemOpOrder,
1436           AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
1437       Success = Builder.CreateExtractValue(Pair, 1, "success");
1438       NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
1439 
1440       // ...and then expand the CAS into a libcall.
1441       expandAtomicCASToLibcall(Pair);
1442     });
1443   }
1444 }
1445 
1446 // A helper routine for the above expandAtomic*ToLibcall functions.
1447 //
1448 // 'Libcalls' contains an array of enum values for the particular
1449 // ATOMIC libcalls to be emitted. All of the other arguments besides
1450 // 'I' are extracted from the Instruction subclass by the
1451 // caller. Depending on the particular call, some will be null.
expandAtomicOpToLibcall(Instruction * I,unsigned Size,unsigned Align,Value * PointerOperand,Value * ValueOperand,Value * CASExpected,AtomicOrdering Ordering,AtomicOrdering Ordering2,ArrayRef<RTLIB::Libcall> Libcalls)1452 bool AtomicExpand::expandAtomicOpToLibcall(
1453     Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
1454     Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
1455     AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
1456   assert(Libcalls.size() == 6);
1457 
1458   LLVMContext &Ctx = I->getContext();
1459   Module *M = I->getModule();
1460   const DataLayout &DL = M->getDataLayout();
1461   IRBuilder<> Builder(I);
1462   IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
1463 
1464   bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
1465   Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
1466 
1467   unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
1468 
1469   // TODO: the "order" argument type is "int", not int32. So
1470   // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
1471   ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
1472   assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
1473   Constant *OrderingVal =
1474       ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
1475   Constant *Ordering2Val = nullptr;
1476   if (CASExpected) {
1477     assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
1478     Ordering2Val =
1479         ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
1480   }
1481   bool HasResult = I->getType() != Type::getVoidTy(Ctx);
1482 
1483   RTLIB::Libcall RTLibType;
1484   if (UseSizedLibcall) {
1485     switch (Size) {
1486     case 1: RTLibType = Libcalls[1]; break;
1487     case 2: RTLibType = Libcalls[2]; break;
1488     case 4: RTLibType = Libcalls[3]; break;
1489     case 8: RTLibType = Libcalls[4]; break;
1490     case 16: RTLibType = Libcalls[5]; break;
1491     }
1492   } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
1493     RTLibType = Libcalls[0];
1494   } else {
1495     // Can't use sized function, and there's no generic for this
1496     // operation, so give up.
1497     return false;
1498   }
1499 
1500   // Build up the function call. There's two kinds. First, the sized
1501   // variants.  These calls are going to be one of the following (with
1502   // N=1,2,4,8,16):
1503   //  iN    __atomic_load_N(iN *ptr, int ordering)
1504   //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
1505   //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
1506   //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
1507   //                                    int success_order, int failure_order)
1508   //
1509   // Note that these functions can be used for non-integer atomic
1510   // operations, the values just need to be bitcast to integers on the
1511   // way in and out.
1512   //
1513   // And, then, the generic variants. They look like the following:
1514   //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
1515   //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
1516   //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
1517   //                          int ordering)
1518   //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
1519   //                                  void *desired, int success_order,
1520   //                                  int failure_order)
1521   //
1522   // The different signatures are built up depending on the
1523   // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
1524   // variables.
1525 
1526   AllocaInst *AllocaCASExpected = nullptr;
1527   Value *AllocaCASExpected_i8 = nullptr;
1528   AllocaInst *AllocaValue = nullptr;
1529   Value *AllocaValue_i8 = nullptr;
1530   AllocaInst *AllocaResult = nullptr;
1531   Value *AllocaResult_i8 = nullptr;
1532 
1533   Type *ResultTy;
1534   SmallVector<Value *, 6> Args;
1535   AttributeSet Attr;
1536 
1537   // 'size' argument.
1538   if (!UseSizedLibcall) {
1539     // Note, getIntPtrType is assumed equivalent to size_t.
1540     Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
1541   }
1542 
1543   // 'ptr' argument.
1544   Value *PtrVal =
1545       Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
1546   Args.push_back(PtrVal);
1547 
1548   // 'expected' argument, if present.
1549   if (CASExpected) {
1550     AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
1551     AllocaCASExpected->setAlignment(AllocaAlignment);
1552     AllocaCASExpected_i8 =
1553         Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
1554     Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
1555     Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
1556     Args.push_back(AllocaCASExpected_i8);
1557   }
1558 
1559   // 'val' argument ('desired' for cas), if present.
1560   if (ValueOperand) {
1561     if (UseSizedLibcall) {
1562       Value *IntValue =
1563           Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
1564       Args.push_back(IntValue);
1565     } else {
1566       AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
1567       AllocaValue->setAlignment(AllocaAlignment);
1568       AllocaValue_i8 =
1569           Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
1570       Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
1571       Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
1572       Args.push_back(AllocaValue_i8);
1573     }
1574   }
1575 
1576   // 'ret' argument.
1577   if (!CASExpected && HasResult && !UseSizedLibcall) {
1578     AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
1579     AllocaResult->setAlignment(AllocaAlignment);
1580     AllocaResult_i8 =
1581         Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
1582     Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
1583     Args.push_back(AllocaResult_i8);
1584   }
1585 
1586   // 'ordering' ('success_order' for cas) argument.
1587   Args.push_back(OrderingVal);
1588 
1589   // 'failure_order' argument, if present.
1590   if (Ordering2Val)
1591     Args.push_back(Ordering2Val);
1592 
1593   // Now, the return type.
1594   if (CASExpected) {
1595     ResultTy = Type::getInt1Ty(Ctx);
1596     Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
1597   } else if (HasResult && UseSizedLibcall)
1598     ResultTy = SizedIntTy;
1599   else
1600     ResultTy = Type::getVoidTy(Ctx);
1601 
1602   // Done with setting up arguments and return types, create the call:
1603   SmallVector<Type *, 6> ArgTys;
1604   for (Value *Arg : Args)
1605     ArgTys.push_back(Arg->getType());
1606   FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
1607   Constant *LibcallFn =
1608       M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
1609   CallInst *Call = Builder.CreateCall(LibcallFn, Args);
1610   Call->setAttributes(Attr);
1611   Value *Result = Call;
1612 
1613   // And then, extract the results...
1614   if (ValueOperand && !UseSizedLibcall)
1615     Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
1616 
1617   if (CASExpected) {
1618     // The final result from the CAS is {load of 'expected' alloca, bool result
1619     // from call}
1620     Type *FinalResultTy = I->getType();
1621     Value *V = UndefValue::get(FinalResultTy);
1622     Value *ExpectedOut =
1623         Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
1624     Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
1625     V = Builder.CreateInsertValue(V, ExpectedOut, 0);
1626     V = Builder.CreateInsertValue(V, Result, 1);
1627     I->replaceAllUsesWith(V);
1628   } else if (HasResult) {
1629     Value *V;
1630     if (UseSizedLibcall)
1631       V = Builder.CreateBitOrPointerCast(Result, I->getType());
1632     else {
1633       V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
1634       Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
1635     }
1636     I->replaceAllUsesWith(V);
1637   }
1638   I->eraseFromParent();
1639   return true;
1640 }
1641