//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
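//
// As an illustrative sketch (the arrays and loop below are made up, not taken
// from a particular test case), a loop such as
//
//     for (i = 0; i < n; i++) {
//       a[i + 1] = a[i] * b[i];  // dependence cycle through a[]
//       d[i] = c[i] * e[i];      // independent of the cycle
//     }
//
// cannot be vectorized as a whole, but may be distributed into
//
//     for (i = 0; i < n; i++)
//       a[i + 1] = a[i] * b[i];
//     for (i = 0; i < n; i++)
//       d[i] = c[i] * e[i];
//
// after which the second loop is vectorizable.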
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// disambiguate potentially overlapping arrays at run time.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loop marked with #pragma loop distribute(enable)"));

// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with a transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop then use the block from
  /// the loop.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as this reduces the number of
    // def-use and use-def chains that need updating.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, this is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
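  ///
  /// As an illustrative sketch (the labels are made up): given the partition
  /// sequence NC0 NC1 C2 NC3 NC4, where C marks a cyclic and NC a non-cyclic
  /// partition, this merges NC0 with NC1 and NC3 with NC4, leaving
  /// {NC0,NC1} C2 {NC3,NC4}.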
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
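  ///
  /// As an illustrative sketch (partition names are made up): if the same load
  /// appears in partitions P0 and P2, then P1 and P2 are both merged into P0,
  /// so the load executes in only one loop and the memory order assumed by
  /// LoopAccessAnalysis is preserved.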
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, it computes the partition id the pointer
  /// is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the I-th
  /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in
  /// multiple partitions its entry is set to -1.
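  ///
  /// As an illustrative sketch: a result of {0, 1, -1} means the first pointer
  /// is accessed only in partition 0, the second only in partition 1, and the
  /// third in more than one partition, so a run-time check involving the third
  /// pointer cannot be dropped.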
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start at this instruction
/// and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
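///
/// As an illustrative sketch (instruction names are made up), assume a single
/// unsafe backward dependence whose endpoints are Load1 and Store3, with
/// Load1 first in program order (so Load1 is the "source" below):
///
///            NumUnsafeDependencesStartOrEnd   running sum
///   Load1                 +1                       1
///   Load2                  0                       1   (dependence spans it)
///   Store3                -1                       0
///
/// A positive running sum at a program point means that some unsafe
/// dependence crosses it.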
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE)
      : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop(LoopAccessLegacyAnalysis *LAA) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH)
      return fail("no preheader");
    if (!L->getExitBlock())
      return fail("multiple exit blocks");

    // LAA will check that we only have a single exiting block.
    LAI = &LAA->getInfo(L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //            NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                 1                       0->1
    //  Load2    | /Unsafe/        0                       1
    //  Store3  -'                -1                       1->0
    //  Load4                      0                       0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction; catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return fail("cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("too many SCEV run-time checks needed.\n");

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple, have an empty preheader before we version or
    // clone the loop. (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, we remove the instructions from each loop that don't belong to that
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify();
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    // Report the success.
    emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
                           "distributed loop");
    return true;
  }

  /// \brief Provide diagnostics, then \return false.
  bool fail(llvm::StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    emitOptimizationRemarkMissed(
        Ctx, LDIST_NAME, *F, L->getStartLoc(),
        "loop not distributed: use -Rpass-analysis=loop-distribute for more "
        "info");

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    emitOptimizationRemarkAnalysis(
        Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
                    : LDIST_NAME,
        *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// \brief Returns whether distribution is forced to be enabled or disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
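  ///
  /// As an illustrative sketch: with PtrToPartition = {0, 0, 1, -1}, a check
  /// between pointers 0 and 1 can be dropped (same partition), a check between
  /// pointers 0 and 2 is kept (different partitions), and any check involving
  /// pointer 3 is kept since -1 may stand for any partition.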
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
                 [&](const RuntimePointerChecking::PointerCheck &Check) {
                   for (unsigned PtrIdx1 : Check.first->Members)
                     for (unsigned PtrIdx2 : Check.second->Members)
                       // Only include this check if there is a pair of
                       // pointers that require checking and the pointers fall
                       // into separate partitions.
                       //
                       // (Note that we already know at this point that the two
                       // pointer groups need checking but it doesn't follow
                       // that each pair of pointers within the two groups need
                       // checking as well.
                       //
                       // In other words we don't want to include a check just
                       // because there is a pair of pointers between the two
                       // pointer groups that require checks and a different
                       // pair whose pointers fall into different partitions.)
                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                           !RuntimePointerChecking::arePointersInSamePartition(
                               PtrToPartition, PtrIdx1, PtrIdx2))
                         return true;
                   return false;
                 });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  Optional<bool> IsForced;
};

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
  /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
  /// default. We use this to keep LoopDistribution off by default when invoked
  /// from the optimization pipeline but on when invoked explicitly from opt.
  LoopDistribute(bool ProcessAllLoopsByDefault = true)
      : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
    // The default is set by the caller.
    if (EnableLoopDistribute.getNumOccurrences() > 0)
      ProcessAllLoops = EnableLoopDistribute;
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();

    // Build up a worklist of inner-most loops to distribute. This is necessary
    // as the act of distributing a loop creates new loops and can invalidate
    // iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist) {
      LoopDistributeForLoop LDL(L, &F, LI, DT, SE);

      // If distribution was forced for the specific loop to be
      // enabled/disabled, follow that. Otherwise use the global flag.
      if (LDL.isForced().getValueOr(ProcessAllLoops))
        Changed |= LDL.processLoop(LAA);
    }

    // Process each loop nest in the function.
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  static char ID;

private:
  /// \brief Whether distribution should be on in this function. The per-loop
  /// pragma can override this.
  bool ProcessAllLoops;
};
} // anonymous namespace

char LoopDistribute::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
  return new LoopDistribute(ProcessAllLoopsByDefault);
}
}