1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This Pass handles loop interchange transform.
11 // This pass interchanges loops to provide a more cache-friendly memory access
12 // patterns.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Analysis/AliasAnalysis.h"
21 #include "llvm/Analysis/DependenceAnalysis.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/ScalarEvolution.h"
25 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/Constants.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/Function.h"
31 #include "llvm/IR/InstrTypes.h"
32 #include "llvm/IR/Instruction.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/IR/User.h"
36 #include "llvm/IR/Value.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Casting.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/ErrorHandling.h"
42 #include "llvm/Support/raw_ostream.h"
43 #include "llvm/Transforms/Scalar.h"
44 #include "llvm/Transforms/Utils.h"
45 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
46 #include "llvm/Transforms/Utils/LoopUtils.h"
47 #include <cassert>
48 #include <utility>
49 #include <vector>
50 
51 using namespace llvm;
52 
53 #define DEBUG_TYPE "loop-interchange"
54 
55 STATISTIC(LoopsInterchanged, "Number of loops interchanged");
56 
57 static cl::opt<int> LoopInterchangeCostThreshold(
58     "loop-interchange-threshold", cl::init(0), cl::Hidden,
59     cl::desc("Interchange if you gain more than this number"));
60 
61 namespace {
62 
63 using LoopVector = SmallVector<Loop *, 8>;
64 
65 // TODO: Check if we can use a sparse matrix here.
66 using CharMatrix = std::vector<std::vector<char>>;
67 
68 } // end anonymous namespace
69 
70 // Maximum number of dependencies that can be handled in the dependency matrix.
71 static const unsigned MaxMemInstrCount = 100;
72 
73 // Maximum loop depth supported.
74 static const unsigned MaxLoopNestDepth = 10;
75 
76 #ifdef DUMP_DEP_MATRICIES
printDepMatrix(CharMatrix & DepMatrix)77 static void printDepMatrix(CharMatrix &DepMatrix) {
78   for (auto &Row : DepMatrix) {
79     for (auto D : Row)
80       LLVM_DEBUG(dbgs() << D << " ");
81     LLVM_DEBUG(dbgs() << "\n");
82   }
83 }
84 #endif
85 
populateDependencyMatrix(CharMatrix & DepMatrix,unsigned Level,Loop * L,DependenceInfo * DI)86 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
87                                      Loop *L, DependenceInfo *DI) {
88   using ValueVector = SmallVector<Value *, 16>;
89 
90   ValueVector MemInstr;
91 
92   // For each block.
93   for (BasicBlock *BB : L->blocks()) {
94     // Scan the BB and collect legal loads and stores.
95     for (Instruction &I : *BB) {
96       if (!isa<Instruction>(I))
97         return false;
98       if (auto *Ld = dyn_cast<LoadInst>(&I)) {
99         if (!Ld->isSimple())
100           return false;
101         MemInstr.push_back(&I);
102       } else if (auto *St = dyn_cast<StoreInst>(&I)) {
103         if (!St->isSimple())
104           return false;
105         MemInstr.push_back(&I);
106       }
107     }
108   }
109 
110   LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
111                     << " Loads and Stores to analyze\n");
112 
113   ValueVector::iterator I, IE, J, JE;
114 
115   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
116     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
117       std::vector<char> Dep;
118       Instruction *Src = cast<Instruction>(*I);
119       Instruction *Dst = cast<Instruction>(*J);
120       if (Src == Dst)
121         continue;
122       // Ignore Input dependencies.
123       if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
124         continue;
125       // Track Output, Flow, and Anti dependencies.
126       if (auto D = DI->depends(Src, Dst, true)) {
127         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
128         LLVM_DEBUG(StringRef DepType =
129                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
130                    dbgs() << "Found " << DepType
131                           << " dependency between Src and Dst\n"
132                           << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
133         unsigned Levels = D->getLevels();
134         char Direction;
135         for (unsigned II = 1; II <= Levels; ++II) {
136           const SCEV *Distance = D->getDistance(II);
137           const SCEVConstant *SCEVConst =
138               dyn_cast_or_null<SCEVConstant>(Distance);
139           if (SCEVConst) {
140             const ConstantInt *CI = SCEVConst->getValue();
141             if (CI->isNegative())
142               Direction = '<';
143             else if (CI->isZero())
144               Direction = '=';
145             else
146               Direction = '>';
147             Dep.push_back(Direction);
148           } else if (D->isScalar(II)) {
149             Direction = 'S';
150             Dep.push_back(Direction);
151           } else {
152             unsigned Dir = D->getDirection(II);
153             if (Dir == Dependence::DVEntry::LT ||
154                 Dir == Dependence::DVEntry::LE)
155               Direction = '<';
156             else if (Dir == Dependence::DVEntry::GT ||
157                      Dir == Dependence::DVEntry::GE)
158               Direction = '>';
159             else if (Dir == Dependence::DVEntry::EQ)
160               Direction = '=';
161             else
162               Direction = '*';
163             Dep.push_back(Direction);
164           }
165         }
166         while (Dep.size() != Level) {
167           Dep.push_back('I');
168         }
169 
170         DepMatrix.push_back(Dep);
171         if (DepMatrix.size() > MaxMemInstrCount) {
172           LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
173                             << " dependencies inside loop\n");
174           return false;
175         }
176       }
177     }
178   }
179 
180   return true;
181 }
182 
183 // A loop is moved from index 'from' to an index 'to'. Update the Dependence
184 // matrix by exchanging the two columns.
interChangeDependencies(CharMatrix & DepMatrix,unsigned FromIndx,unsigned ToIndx)185 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
186                                     unsigned ToIndx) {
187   unsigned numRows = DepMatrix.size();
188   for (unsigned i = 0; i < numRows; ++i) {
189     char TmpVal = DepMatrix[i][ToIndx];
190     DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
191     DepMatrix[i][FromIndx] = TmpVal;
192   }
193 }
194 
195 // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
196 // '>'
isOuterMostDepPositive(CharMatrix & DepMatrix,unsigned Row,unsigned Column)197 static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
198                                    unsigned Column) {
199   for (unsigned i = 0; i <= Column; ++i) {
200     if (DepMatrix[Row][i] == '<')
201       return false;
202     if (DepMatrix[Row][i] == '>')
203       return true;
204   }
205   // All dependencies were '=','S' or 'I'
206   return false;
207 }
208 
209 // Checks if no dependence exist in the dependency matrix in Row before Column.
containsNoDependence(CharMatrix & DepMatrix,unsigned Row,unsigned Column)210 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
211                                  unsigned Column) {
212   for (unsigned i = 0; i < Column; ++i) {
213     if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
214         DepMatrix[Row][i] != 'I')
215       return false;
216   }
217   return true;
218 }
219 
validDepInterchange(CharMatrix & DepMatrix,unsigned Row,unsigned OuterLoopId,char InnerDep,char OuterDep)220 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
221                                 unsigned OuterLoopId, char InnerDep,
222                                 char OuterDep) {
223   if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
224     return false;
225 
226   if (InnerDep == OuterDep)
227     return true;
228 
229   // It is legal to interchange if and only if after interchange no row has a
230   // '>' direction as the leftmost non-'='.
231 
232   if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
233     return true;
234 
235   if (InnerDep == '<')
236     return true;
237 
238   if (InnerDep == '>') {
239     // If OuterLoopId represents outermost loop then interchanging will make the
240     // 1st dependency as '>'
241     if (OuterLoopId == 0)
242       return false;
243 
244     // If all dependencies before OuterloopId are '=','S'or 'I'. Then
245     // interchanging will result in this row having an outermost non '='
246     // dependency of '>'
247     if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
248       return true;
249   }
250 
251   return false;
252 }
253 
254 // Checks if it is legal to interchange 2 loops.
255 // [Theorem] A permutation of the loops in a perfect nest is legal if and only
256 // if the direction matrix, after the same permutation is applied to its
257 // columns, has no ">" direction as the leftmost non-"=" direction in any row.
isLegalToInterChangeLoops(CharMatrix & DepMatrix,unsigned InnerLoopId,unsigned OuterLoopId)258 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
259                                       unsigned InnerLoopId,
260                                       unsigned OuterLoopId) {
261   unsigned NumRows = DepMatrix.size();
262   // For each row check if it is valid to interchange.
263   for (unsigned Row = 0; Row < NumRows; ++Row) {
264     char InnerDep = DepMatrix[Row][InnerLoopId];
265     char OuterDep = DepMatrix[Row][OuterLoopId];
266     if (InnerDep == '*' || OuterDep == '*')
267       return false;
268     if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
269       return false;
270   }
271   return true;
272 }
273 
populateWorklist(Loop & L,SmallVector<LoopVector,8> & V)274 static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
275   LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
276                     << L.getHeader()->getParent()->getName() << " Loop: %"
277                     << L.getHeader()->getName() << '\n');
278   LoopVector LoopList;
279   Loop *CurrentLoop = &L;
280   const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
281   while (!Vec->empty()) {
282     // The current loop has multiple subloops in it hence it is not tightly
283     // nested.
284     // Discard all loops above it added into Worklist.
285     if (Vec->size() != 1) {
286       LoopList.clear();
287       return;
288     }
289     LoopList.push_back(CurrentLoop);
290     CurrentLoop = Vec->front();
291     Vec = &CurrentLoop->getSubLoops();
292   }
293   LoopList.push_back(CurrentLoop);
294   V.push_back(std::move(LoopList));
295 }
296 
getInductionVariable(Loop * L,ScalarEvolution * SE)297 static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
298   PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
299   if (InnerIndexVar)
300     return InnerIndexVar;
301   if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
302     return nullptr;
303   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
304     PHINode *PhiVar = cast<PHINode>(I);
305     Type *PhiTy = PhiVar->getType();
306     if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
307         !PhiTy->isPointerTy())
308       return nullptr;
309     const SCEVAddRecExpr *AddRec =
310         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
311     if (!AddRec || !AddRec->isAffine())
312       continue;
313     const SCEV *Step = AddRec->getStepRecurrence(*SE);
314     if (!isa<SCEVConstant>(Step))
315       continue;
316     // Found the induction variable.
317     // FIXME: Handle loops with more than one induction variable. Note that,
318     // currently, legality makes sure we have only one induction variable.
319     return PhiVar;
320   }
321   return nullptr;
322 }
323 
324 namespace {
325 
326 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
327 class LoopInterchangeLegality {
328 public:
LoopInterchangeLegality(Loop * Outer,Loop * Inner,ScalarEvolution * SE,LoopInfo * LI,DominatorTree * DT,bool PreserveLCSSA,OptimizationRemarkEmitter * ORE)329   LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
330                           LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
331                           OptimizationRemarkEmitter *ORE)
332       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
333         PreserveLCSSA(PreserveLCSSA), ORE(ORE) {}
334 
335   /// Check if the loops can be interchanged.
336   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
337                            CharMatrix &DepMatrix);
338 
339   /// Check if the loop structure is understood. We do not handle triangular
340   /// loops for now.
341   bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
342 
343   bool currentLimitations();
344 
hasInnerLoopReduction()345   bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
346 
347 private:
348   bool tightlyNested(Loop *Outer, Loop *Inner);
349   bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
350   bool areAllUsesReductions(Instruction *Ins, Loop *L);
351   bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
352   bool findInductionAndReductions(Loop *L,
353                                   SmallVector<PHINode *, 8> &Inductions,
354                                   SmallVector<PHINode *, 8> &Reductions);
355 
356   Loop *OuterLoop;
357   Loop *InnerLoop;
358 
359   ScalarEvolution *SE;
360   LoopInfo *LI;
361   DominatorTree *DT;
362   bool PreserveLCSSA;
363 
364   /// Interface to emit optimization remarks.
365   OptimizationRemarkEmitter *ORE;
366 
367   bool InnerLoopHasReduction = false;
368 };
369 
370 /// LoopInterchangeProfitability checks if it is profitable to interchange the
371 /// loop.
372 class LoopInterchangeProfitability {
373 public:
LoopInterchangeProfitability(Loop * Outer,Loop * Inner,ScalarEvolution * SE,OptimizationRemarkEmitter * ORE)374   LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
375                                OptimizationRemarkEmitter *ORE)
376       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
377 
378   /// Check if the loop interchange is profitable.
379   bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
380                     CharMatrix &DepMatrix);
381 
382 private:
383   int getInstrOrderCost();
384 
385   Loop *OuterLoop;
386   Loop *InnerLoop;
387 
388   /// Scev analysis.
389   ScalarEvolution *SE;
390 
391   /// Interface to emit optimization remarks.
392   OptimizationRemarkEmitter *ORE;
393 };
394 
395 /// LoopInterchangeTransform interchanges the loop.
396 class LoopInterchangeTransform {
397 public:
LoopInterchangeTransform(Loop * Outer,Loop * Inner,ScalarEvolution * SE,LoopInfo * LI,DominatorTree * DT,BasicBlock * LoopNestExit,bool InnerLoopContainsReductions)398   LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
399                            LoopInfo *LI, DominatorTree *DT,
400                            BasicBlock *LoopNestExit,
401                            bool InnerLoopContainsReductions)
402       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
403         LoopExit(LoopNestExit),
404         InnerLoopHasReduction(InnerLoopContainsReductions) {}
405 
406   /// Interchange OuterLoop and InnerLoop.
407   bool transform();
408   void restructureLoops(Loop *NewInner, Loop *NewOuter,
409                         BasicBlock *OrigInnerPreHeader,
410                         BasicBlock *OrigOuterPreHeader);
411   void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
412 
413 private:
414   void splitInnerLoopLatch(Instruction *);
415   void splitInnerLoopHeader();
416   bool adjustLoopLinks();
417   void adjustLoopPreheaders();
418   bool adjustLoopBranches();
419   void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
420                            BasicBlock *NewPred);
421 
422   Loop *OuterLoop;
423   Loop *InnerLoop;
424 
425   /// Scev analysis.
426   ScalarEvolution *SE;
427 
428   LoopInfo *LI;
429   DominatorTree *DT;
430   BasicBlock *LoopExit;
431   bool InnerLoopHasReduction;
432 };
433 
434 // Main LoopInterchange Pass.
435 struct LoopInterchange : public FunctionPass {
436   static char ID;
437   ScalarEvolution *SE = nullptr;
438   LoopInfo *LI = nullptr;
439   DependenceInfo *DI = nullptr;
440   DominatorTree *DT = nullptr;
441   bool PreserveLCSSA;
442 
443   /// Interface to emit optimization remarks.
444   OptimizationRemarkEmitter *ORE;
445 
LoopInterchange__anon8deb94cf0211::LoopInterchange446   LoopInterchange() : FunctionPass(ID) {
447     initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
448   }
449 
getAnalysisUsage__anon8deb94cf0211::LoopInterchange450   void getAnalysisUsage(AnalysisUsage &AU) const override {
451     AU.addRequired<ScalarEvolutionWrapperPass>();
452     AU.addRequired<AAResultsWrapperPass>();
453     AU.addRequired<DominatorTreeWrapperPass>();
454     AU.addRequired<LoopInfoWrapperPass>();
455     AU.addRequired<DependenceAnalysisWrapperPass>();
456     AU.addRequiredID(LoopSimplifyID);
457     AU.addRequiredID(LCSSAID);
458     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
459 
460     AU.addPreserved<DominatorTreeWrapperPass>();
461     AU.addPreserved<LoopInfoWrapperPass>();
462   }
463 
runOnFunction__anon8deb94cf0211::LoopInterchange464   bool runOnFunction(Function &F) override {
465     if (skipFunction(F))
466       return false;
467 
468     SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
469     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
470     DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
471     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
472     ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
473     PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
474 
475     // Build up a worklist of loop pairs to analyze.
476     SmallVector<LoopVector, 8> Worklist;
477 
478     for (Loop *L : *LI)
479       populateWorklist(*L, Worklist);
480 
481     LLVM_DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
482     bool Changed = true;
483     while (!Worklist.empty()) {
484       LoopVector LoopList = Worklist.pop_back_val();
485       Changed = processLoopList(LoopList, F);
486     }
487     return Changed;
488   }
489 
isComputableLoopNest__anon8deb94cf0211::LoopInterchange490   bool isComputableLoopNest(LoopVector LoopList) {
491     for (Loop *L : LoopList) {
492       const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
493       if (ExitCountOuter == SE->getCouldNotCompute()) {
494         LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
495         return false;
496       }
497       if (L->getNumBackEdges() != 1) {
498         LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
499         return false;
500       }
501       if (!L->getExitingBlock()) {
502         LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
503         return false;
504       }
505     }
506     return true;
507   }
508 
selectLoopForInterchange__anon8deb94cf0211::LoopInterchange509   unsigned selectLoopForInterchange(const LoopVector &LoopList) {
510     // TODO: Add a better heuristic to select the loop to be interchanged based
511     // on the dependence matrix. Currently we select the innermost loop.
512     return LoopList.size() - 1;
513   }
514 
processLoopList__anon8deb94cf0211::LoopInterchange515   bool processLoopList(LoopVector LoopList, Function &F) {
516     bool Changed = false;
517     unsigned LoopNestDepth = LoopList.size();
518     if (LoopNestDepth < 2) {
519       LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
520       return false;
521     }
522     if (LoopNestDepth > MaxLoopNestDepth) {
523       LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
524                         << MaxLoopNestDepth << "\n");
525       return false;
526     }
527     if (!isComputableLoopNest(LoopList)) {
528       LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
529       return false;
530     }
531 
532     LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
533                       << "\n");
534 
535     CharMatrix DependencyMatrix;
536     Loop *OuterMostLoop = *(LoopList.begin());
537     if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
538                                   OuterMostLoop, DI)) {
539       LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
540       return false;
541     }
542 #ifdef DUMP_DEP_MATRICIES
543     LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
544     printDepMatrix(DependencyMatrix);
545 #endif
546 
547     // Get the Outermost loop exit.
548     BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
549     if (!LoopNestExit) {
550       LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
551       return false;
552     }
553 
554     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
555     // Move the selected loop outwards to the best possible position.
556     for (unsigned i = SelecLoopId; i > 0; i--) {
557       bool Interchanged =
558           processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
559       if (!Interchanged)
560         return Changed;
561       // Loops interchanged reflect the same in LoopList
562       std::swap(LoopList[i - 1], LoopList[i]);
563 
564       // Update the DependencyMatrix
565       interChangeDependencies(DependencyMatrix, i, i - 1);
566 #ifdef DUMP_DEP_MATRICIES
567       LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
568       printDepMatrix(DependencyMatrix);
569 #endif
570       Changed |= Interchanged;
571     }
572     return Changed;
573   }
574 
processLoop__anon8deb94cf0211::LoopInterchange575   bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
576                    unsigned OuterLoopId, BasicBlock *LoopNestExit,
577                    std::vector<std::vector<char>> &DependencyMatrix) {
578     LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
579                       << " and OuterLoopId = " << OuterLoopId << "\n");
580     Loop *InnerLoop = LoopList[InnerLoopId];
581     Loop *OuterLoop = LoopList[OuterLoopId];
582 
583     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
584                                 PreserveLCSSA, ORE);
585     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
586       LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
587       return false;
588     }
589     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
590     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
591     if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
592       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
593       return false;
594     }
595 
596     ORE->emit([&]() {
597       return OptimizationRemark(DEBUG_TYPE, "Interchanged",
598                                 InnerLoop->getStartLoc(),
599                                 InnerLoop->getHeader())
600              << "Loop interchanged with enclosing loop.";
601     });
602 
603     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
604                                  LoopNestExit, LIL.hasInnerLoopReduction());
605     LIT.transform();
606     LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
607     LoopsInterchanged++;
608     return true;
609   }
610 };
611 
612 } // end anonymous namespace
613 
areAllUsesReductions(Instruction * Ins,Loop * L)614 bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
615   return llvm::none_of(Ins->users(), [=](User *U) -> bool {
616     auto *UserIns = dyn_cast<PHINode>(U);
617     RecurrenceDescriptor RD;
618     return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
619   });
620 }
621 
containsUnsafeInstructionsInHeader(BasicBlock * BB)622 bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
623     BasicBlock *BB) {
624   for (Instruction &I : *BB) {
625     // Load corresponding to reduction PHI's are safe while concluding if
626     // tightly nested.
627     if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
628       if (!areAllUsesReductions(L, InnerLoop))
629         return true;
630     } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
631       return true;
632   }
633   return false;
634 }
635 
containsUnsafeInstructionsInLatch(BasicBlock * BB)636 bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
637     BasicBlock *BB) {
638   for (Instruction &I : *BB) {
639     // Stores corresponding to reductions are safe while concluding if tightly
640     // nested.
641     if (StoreInst *L = dyn_cast<StoreInst>(&I)) {
642       if (!isa<PHINode>(L->getOperand(0)))
643         return true;
644     } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
645       return true;
646   }
647   return false;
648 }
649 
tightlyNested(Loop * OuterLoop,Loop * InnerLoop)650 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
651   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
652   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
653   BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
654 
655   LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
656 
657   // A perfectly nested loop will not have any branch in between the outer and
658   // inner block i.e. outer header will branch to either inner preheader and
659   // outerloop latch.
660   BranchInst *OuterLoopHeaderBI =
661       dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
662   if (!OuterLoopHeaderBI)
663     return false;
664 
665   for (BasicBlock *Succ : OuterLoopHeaderBI->successors())
666     if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch)
667       return false;
668 
669   LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
670   // We do not have any basic block in between now make sure the outer header
671   // and outer loop latch doesn't contain any unsafe instructions.
672   if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
673       containsUnsafeInstructionsInLatch(OuterLoopLatch))
674     return false;
675 
676   LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
677   // We have a perfect loop nest.
678   return true;
679 }
680 
isLoopStructureUnderstood(PHINode * InnerInduction)681 bool LoopInterchangeLegality::isLoopStructureUnderstood(
682     PHINode *InnerInduction) {
683   unsigned Num = InnerInduction->getNumOperands();
684   BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
685   for (unsigned i = 0; i < Num; ++i) {
686     Value *Val = InnerInduction->getOperand(i);
687     if (isa<Constant>(Val))
688       continue;
689     Instruction *I = dyn_cast<Instruction>(Val);
690     if (!I)
691       return false;
692     // TODO: Handle triangular loops.
693     // e.g. for(int i=0;i<N;i++)
694     //        for(int j=i;j<N;j++)
695     unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
696     if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
697             InnerLoopPreheader &&
698         !OuterLoop->isLoopInvariant(I)) {
699       return false;
700     }
701   }
702   return true;
703 }
704 
findInductionAndReductions(Loop * L,SmallVector<PHINode *,8> & Inductions,SmallVector<PHINode *,8> & Reductions)705 bool LoopInterchangeLegality::findInductionAndReductions(
706     Loop *L, SmallVector<PHINode *, 8> &Inductions,
707     SmallVector<PHINode *, 8> &Reductions) {
708   if (!L->getLoopLatch() || !L->getLoopPredecessor())
709     return false;
710   for (PHINode &PHI : L->getHeader()->phis()) {
711     RecurrenceDescriptor RD;
712     InductionDescriptor ID;
713     if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
714       Inductions.push_back(&PHI);
715     else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD))
716       Reductions.push_back(&PHI);
717     else {
718       LLVM_DEBUG(
719           dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
720       return false;
721     }
722   }
723   return true;
724 }
725 
containsSafePHI(BasicBlock * Block,bool isOuterLoopExitBlock)726 static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
727   for (PHINode &PHI : Block->phis()) {
728     // Reduction lcssa phi will have only 1 incoming block that from loop latch.
729     if (PHI.getNumIncomingValues() > 1)
730       return false;
731     Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0));
732     if (!Ins)
733       return false;
734     // Incoming value for lcssa phi's in outer loop exit can only be inner loop
735     // exits lcssa phi else it would not be tightly nested.
736     if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
737       return false;
738   }
739   return true;
740 }
741 
742 // This function indicates the current limitations in the transform as a result
743 // of which we do not proceed.
currentLimitations()744 bool LoopInterchangeLegality::currentLimitations() {
745   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
746   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
747 
748   // transform currently expects the loop latches to also be the exiting
749   // blocks.
750   if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
751       OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
752       !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
753       !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
754     LLVM_DEBUG(
755         dbgs() << "Loops where the latch is not the exiting block are not"
756                << " supported currently.\n");
757     ORE->emit([&]() {
758       return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
759                                       OuterLoop->getStartLoc(),
760                                       OuterLoop->getHeader())
761              << "Loops where the latch is not the exiting block cannot be"
762                 " interchange currently.";
763     });
764     return true;
765   }
766 
767   PHINode *InnerInductionVar;
768   SmallVector<PHINode *, 8> Inductions;
769   SmallVector<PHINode *, 8> Reductions;
770   if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
771     LLVM_DEBUG(
772         dbgs() << "Only inner loops with induction or reduction PHI nodes "
773                << "are supported currently.\n");
774     ORE->emit([&]() {
775       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
776                                       InnerLoop->getStartLoc(),
777                                       InnerLoop->getHeader())
778              << "Only inner loops with induction or reduction PHI nodes can be"
779                 " interchange currently.";
780     });
781     return true;
782   }
783 
784   // TODO: Currently we handle only loops with 1 induction variable.
785   if (Inductions.size() != 1) {
786     LLVM_DEBUG(
787         dbgs() << "We currently only support loops with 1 induction variable."
788                << "Failed to interchange due to current limitation\n");
789     ORE->emit([&]() {
790       return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
791                                       InnerLoop->getStartLoc(),
792                                       InnerLoop->getHeader())
793              << "Only inner loops with 1 induction variable can be "
794                 "interchanged currently.";
795     });
796     return true;
797   }
798   if (Reductions.size() > 0)
799     InnerLoopHasReduction = true;
800 
801   InnerInductionVar = Inductions.pop_back_val();
802   Reductions.clear();
803   if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
804     LLVM_DEBUG(
805         dbgs() << "Only outer loops with induction or reduction PHI nodes "
806                << "are supported currently.\n");
807     ORE->emit([&]() {
808       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
809                                       OuterLoop->getStartLoc(),
810                                       OuterLoop->getHeader())
811              << "Only outer loops with induction or reduction PHI nodes can be"
812                 " interchanged currently.";
813     });
814     return true;
815   }
816 
817   // Outer loop cannot have reduction because then loops will not be tightly
818   // nested.
819   if (!Reductions.empty()) {
820     LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported "
821                       << "currently.\n");
822     ORE->emit([&]() {
823       return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
824                                       OuterLoop->getStartLoc(),
825                                       OuterLoop->getHeader())
826              << "Outer loops with reductions cannot be interchangeed "
827                 "currently.";
828     });
829     return true;
830   }
831   // TODO: Currently we handle only loops with 1 induction variable.
832   if (Inductions.size() != 1) {
833     LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
834                       << "supported currently.\n");
835     ORE->emit([&]() {
836       return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
837                                       OuterLoop->getStartLoc(),
838                                       OuterLoop->getHeader())
839              << "Only outer loops with 1 induction variable can be "
840                 "interchanged currently.";
841     });
842     return true;
843   }
844 
845   // TODO: Triangular loops are not handled for now.
846   if (!isLoopStructureUnderstood(InnerInductionVar)) {
847     LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
848     ORE->emit([&]() {
849       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
850                                       InnerLoop->getStartLoc(),
851                                       InnerLoop->getHeader())
852              << "Inner loop structure not understood currently.";
853     });
854     return true;
855   }
856 
857   // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
858   BasicBlock *InnerExit = InnerLoop->getExitBlock();
859   if (!containsSafePHI(InnerExit, false)) {
860     LLVM_DEBUG(
861         dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
862     ORE->emit([&]() {
863       return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
864                                       InnerLoop->getStartLoc(),
865                                       InnerLoop->getHeader())
866              << "Only inner loops with LCSSA PHIs can be interchange "
867                 "currently.";
868     });
869     return true;
870   }
871 
872   // TODO: Current limitation: Since we split the inner loop latch at the point
873   // were induction variable is incremented (induction.next); We cannot have
874   // more than 1 user of induction.next since it would result in broken code
875   // after split.
876   // e.g.
877   // for(i=0;i<N;i++) {
878   //    for(j = 0;j<M;j++) {
879   //      A[j+1][i+2] = A[j][i]+k;
880   //  }
881   // }
882   Instruction *InnerIndexVarInc = nullptr;
883   if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
884     InnerIndexVarInc =
885         dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
886   else
887     InnerIndexVarInc =
888         dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
889 
890   if (!InnerIndexVarInc) {
891     LLVM_DEBUG(
892         dbgs() << "Did not find an instruction to increment the induction "
893                << "variable.\n");
894     ORE->emit([&]() {
895       return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
896                                       InnerLoop->getStartLoc(),
897                                       InnerLoop->getHeader())
898              << "The inner loop does not increment the induction variable.";
899     });
900     return true;
901   }
902 
903   // Since we split the inner loop latch on this induction variable. Make sure
904   // we do not have any instruction between the induction variable and branch
905   // instruction.
906 
907   bool FoundInduction = false;
908   for (const Instruction &I :
909        llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
910     if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
911         isa<ZExtInst>(I))
912       continue;
913 
914     // We found an instruction. If this is not induction variable then it is not
915     // safe to split this loop latch.
916     if (!I.isIdenticalTo(InnerIndexVarInc)) {
917       LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
918                         << "variable increment and branch.\n");
919       ORE->emit([&]() {
920         return OptimizationRemarkMissed(
921                    DEBUG_TYPE, "UnsupportedInsBetweenInduction",
922                    InnerLoop->getStartLoc(), InnerLoop->getHeader())
923                << "Found unsupported instruction between induction variable "
924                   "increment and branch.";
925       });
926       return true;
927     }
928 
929     FoundInduction = true;
930     break;
931   }
932   // The loop latch ended and we didn't find the induction variable return as
933   // current limitation.
934   if (!FoundInduction) {
935     LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
936     ORE->emit([&]() {
937       return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
938                                       InnerLoop->getStartLoc(),
939                                       InnerLoop->getHeader())
940              << "Did not find the induction variable.";
941     });
942     return true;
943   }
944   return false;
945 }
946 
947 // We currently support LCSSA PHI nodes in the outer loop exit, if their
948 // incoming values do not come from the outer loop latch or if the
949 // outer loop latch has a single predecessor. In that case, the value will
950 // be available if both the inner and outer loop conditions are true, which
951 // will still be true after interchanging. If we have multiple predecessor,
952 // that may not be the case, e.g. because the outer loop latch may be executed
953 // if the inner loop is not executed.
areLoopExitPHIsSupported(Loop * OuterLoop,Loop * InnerLoop)954 static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
955   BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
956   for (PHINode &PHI : LoopNestExit->phis()) {
957     //  FIXME: We currently are not able to detect floating point reductions
958     //         and have to use floating point PHIs as a proxy to prevent
959     //         interchanging in the presence of floating point reductions.
960     if (PHI.getType()->isFloatingPointTy())
961       return false;
962     for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
963      Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
964      if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
965        continue;
966 
967      // The incoming value is defined in the outer loop latch. Currently we
968      // only support that in case the outer loop latch has a single predecessor.
969      // This guarantees that the outer loop latch is executed if and only if
970      // the inner loop is executed (because tightlyNested() guarantees that the
971      // outer loop header only branches to the inner loop or the outer loop
972      // latch).
973      // FIXME: We could weaken this logic and allow multiple predecessors,
974      //        if the values are produced outside the loop latch. We would need
975      //        additional logic to update the PHI nodes in the exit block as
976      //        well.
977      if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
978        return false;
979     }
980   }
981   return true;
982 }
983 
canInterchangeLoops(unsigned InnerLoopId,unsigned OuterLoopId,CharMatrix & DepMatrix)984 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
985                                                   unsigned OuterLoopId,
986                                                   CharMatrix &DepMatrix) {
987   if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
988     LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
989                       << " and OuterLoopId = " << OuterLoopId
990                       << " due to dependence\n");
991     ORE->emit([&]() {
992       return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
993                                       InnerLoop->getStartLoc(),
994                                       InnerLoop->getHeader())
995              << "Cannot interchange loops due to dependences.";
996     });
997     return false;
998   }
999   // Check if outer and inner loop contain legal instructions only.
1000   for (auto *BB : OuterLoop->blocks())
1001     for (Instruction &I : BB->instructionsWithoutDebug())
1002       if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1003         // readnone functions do not prevent interchanging.
1004         if (CI->doesNotReadMemory())
1005           continue;
1006         LLVM_DEBUG(
1007             dbgs() << "Loops with call instructions cannot be interchanged "
1008                    << "safely.");
1009         ORE->emit([&]() {
1010           return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1011                                           CI->getDebugLoc(),
1012                                           CI->getParent())
1013                  << "Cannot interchange loops due to call instruction.";
1014         });
1015 
1016         return false;
1017       }
1018 
1019   // Create unique Preheaders if we already do not have one.
1020   BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1021   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1022 
1023   // Create  a unique outer preheader -
1024   // 1) If OuterLoop preheader is not present.
1025   // 2) If OuterLoop Preheader is same as OuterLoop Header
1026   // 3) If OuterLoop Preheader is same as Header of the previous loop.
1027   // 4) If OuterLoop Preheader is Entry node.
1028   if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
1029       isa<PHINode>(OuterLoopPreHeader->begin()) ||
1030       !OuterLoopPreHeader->getUniquePredecessor()) {
1031     OuterLoopPreHeader =
1032         InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
1033   }
1034 
1035   if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
1036       InnerLoopPreHeader == OuterLoop->getHeader()) {
1037     InnerLoopPreHeader =
1038         InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
1039   }
1040 
1041   // TODO: The loops could not be interchanged due to current limitations in the
1042   // transform module.
1043   if (currentLimitations()) {
1044     LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1045     return false;
1046   }
1047 
1048   // Check if the loops are tightly nested.
1049   if (!tightlyNested(OuterLoop, InnerLoop)) {
1050     LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1051     ORE->emit([&]() {
1052       return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1053                                       InnerLoop->getStartLoc(),
1054                                       InnerLoop->getHeader())
1055              << "Cannot interchange loops because they are not tightly "
1056                 "nested.";
1057     });
1058     return false;
1059   }
1060 
1061   if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1062     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1063     ORE->emit([&]() {
1064       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1065                                       OuterLoop->getStartLoc(),
1066                                       OuterLoop->getHeader())
1067              << "Found unsupported PHI node in loop exit.";
1068     });
1069     return false;
1070   }
1071 
1072   return true;
1073 }
1074 
getInstrOrderCost()1075 int LoopInterchangeProfitability::getInstrOrderCost() {
1076   unsigned GoodOrder, BadOrder;
1077   BadOrder = GoodOrder = 0;
1078   for (BasicBlock *BB : InnerLoop->blocks()) {
1079     for (Instruction &Ins : *BB) {
1080       if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1081         unsigned NumOp = GEP->getNumOperands();
1082         bool FoundInnerInduction = false;
1083         bool FoundOuterInduction = false;
1084         for (unsigned i = 0; i < NumOp; ++i) {
1085           const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1086           const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1087           if (!AR)
1088             continue;
1089 
1090           // If we find the inner induction after an outer induction e.g.
1091           // for(int i=0;i<N;i++)
1092           //   for(int j=0;j<N;j++)
1093           //     A[i][j] = A[i-1][j-1]+k;
1094           // then it is a good order.
1095           if (AR->getLoop() == InnerLoop) {
1096             // We found an InnerLoop induction after OuterLoop induction. It is
1097             // a good order.
1098             FoundInnerInduction = true;
1099             if (FoundOuterInduction) {
1100               GoodOrder++;
1101               break;
1102             }
1103           }
1104           // If we find the outer induction after an inner induction e.g.
1105           // for(int i=0;i<N;i++)
1106           //   for(int j=0;j<N;j++)
1107           //     A[j][i] = A[j-1][i-1]+k;
1108           // then it is a bad order.
1109           if (AR->getLoop() == OuterLoop) {
1110             // We found an OuterLoop induction after InnerLoop induction. It is
1111             // a bad order.
1112             FoundOuterInduction = true;
1113             if (FoundInnerInduction) {
1114               BadOrder++;
1115               break;
1116             }
1117           }
1118         }
1119       }
1120     }
1121   }
1122   return GoodOrder - BadOrder;
1123 }
1124 
isProfitableForVectorization(unsigned InnerLoopId,unsigned OuterLoopId,CharMatrix & DepMatrix)1125 static bool isProfitableForVectorization(unsigned InnerLoopId,
1126                                          unsigned OuterLoopId,
1127                                          CharMatrix &DepMatrix) {
1128   // TODO: Improve this heuristic to catch more cases.
1129   // If the inner loop is loop independent or doesn't carry any dependency it is
1130   // profitable to move this to outer position.
1131   for (auto &Row : DepMatrix) {
1132     if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
1133       return false;
1134     // TODO: We need to improve this heuristic.
1135     if (Row[OuterLoopId] != '=')
1136       return false;
1137   }
1138   // If outer loop has dependence and inner loop is loop independent then it is
1139   // profitable to interchange to enable parallelism.
1140   // If there are no dependences, interchanging will not improve anything.
1141   return !DepMatrix.empty();
1142 }
1143 
isProfitable(unsigned InnerLoopId,unsigned OuterLoopId,CharMatrix & DepMatrix)1144 bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
1145                                                 unsigned OuterLoopId,
1146                                                 CharMatrix &DepMatrix) {
1147   // TODO: Add better profitability checks.
1148   // e.g
1149   // 1) Construct dependency matrix and move the one with no loop carried dep
1150   //    inside to enable vectorization.
1151 
1152   // This is rough cost estimation algorithm. It counts the good and bad order
1153   // of induction variables in the instruction and allows reordering if number
1154   // of bad orders is more than good.
1155   int Cost = getInstrOrderCost();
1156   LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1157   if (Cost < -LoopInterchangeCostThreshold)
1158     return true;
1159 
1160   // It is not profitable as per current cache profitability model. But check if
1161   // we can move this loop outside to improve parallelism.
1162   if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
1163     return true;
1164 
1165   ORE->emit([&]() {
1166     return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1167                                     InnerLoop->getStartLoc(),
1168                                     InnerLoop->getHeader())
1169            << "Interchanging loops is too costly (cost="
1170            << ore::NV("Cost", Cost) << ", threshold="
1171            << ore::NV("Threshold", LoopInterchangeCostThreshold)
1172            << ") and it does not improve parallelism.";
1173   });
1174   return false;
1175 }
1176 
removeChildLoop(Loop * OuterLoop,Loop * InnerLoop)1177 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1178                                                Loop *InnerLoop) {
1179   for (Loop *L : *OuterLoop)
1180     if (L == InnerLoop) {
1181       OuterLoop->removeChildLoop(L);
1182       return;
1183     }
1184   llvm_unreachable("Couldn't find loop");
1185 }
1186 
1187 /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1188 /// new inner and outer loop after interchanging: NewInner is the original
1189 /// outer loop and NewOuter is the original inner loop.
1190 ///
1191 /// Before interchanging, we have the following structure
1192 /// Outer preheader
1193 //  Outer header
1194 //    Inner preheader
1195 //    Inner header
1196 //      Inner body
1197 //      Inner latch
1198 //   outer bbs
1199 //   Outer latch
1200 //
1201 // After interchanging:
1202 // Inner preheader
1203 // Inner header
1204 //   Outer preheader
1205 //   Outer header
1206 //     Inner body
1207 //     outer bbs
1208 //     Outer latch
1209 //   Inner latch
restructureLoops(Loop * NewInner,Loop * NewOuter,BasicBlock * OrigInnerPreHeader,BasicBlock * OrigOuterPreHeader)1210 void LoopInterchangeTransform::restructureLoops(
1211     Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1212     BasicBlock *OrigOuterPreHeader) {
1213   Loop *OuterLoopParent = OuterLoop->getParentLoop();
1214   // The original inner loop preheader moves from the new inner loop to
1215   // the parent loop, if there is one.
1216   NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1217   LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1218 
1219   // Switch the loop levels.
1220   if (OuterLoopParent) {
1221     // Remove the loop from its parent loop.
1222     removeChildLoop(OuterLoopParent, NewInner);
1223     removeChildLoop(NewInner, NewOuter);
1224     OuterLoopParent->addChildLoop(NewOuter);
1225   } else {
1226     removeChildLoop(NewInner, NewOuter);
1227     LI->changeTopLevelLoop(NewInner, NewOuter);
1228   }
1229   while (!NewOuter->empty())
1230     NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1231   NewOuter->addChildLoop(NewInner);
1232 
1233   // BBs from the original inner loop.
1234   SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1235 
1236   // Add BBs from the original outer loop to the original inner loop (excluding
1237   // BBs already in inner loop)
1238   for (BasicBlock *BB : NewInner->blocks())
1239     if (LI->getLoopFor(BB) == NewInner)
1240       NewOuter->addBlockEntry(BB);
1241 
1242   // Now remove inner loop header and latch from the new inner loop and move
1243   // other BBs (the loop body) to the new inner loop.
1244   BasicBlock *OuterHeader = NewOuter->getHeader();
1245   BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1246   for (BasicBlock *BB : OrigInnerBBs) {
1247     // Nothing will change for BBs in child loops.
1248     if (LI->getLoopFor(BB) != NewOuter)
1249       continue;
1250     // Remove the new outer loop header and latch from the new inner loop.
1251     if (BB == OuterHeader || BB == OuterLatch)
1252       NewInner->removeBlockFromLoop(BB);
1253     else
1254       LI->changeLoopFor(BB, NewInner);
1255   }
1256 
1257   // The preheader of the original outer loop becomes part of the new
1258   // outer loop.
1259   NewOuter->addBlockEntry(OrigOuterPreHeader);
1260   LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1261 }
1262 
transform()1263 bool LoopInterchangeTransform::transform() {
1264   bool Transformed = false;
1265   Instruction *InnerIndexVar;
1266 
1267   if (InnerLoop->getSubLoops().empty()) {
1268     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1269     LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n");
1270     PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
1271     if (!InductionPHI) {
1272       LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1273       return false;
1274     }
1275 
1276     if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1277       InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
1278     else
1279       InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
1280 
1281     // Ensure that InductionPHI is the first Phi node.
1282     if (&InductionPHI->getParent()->front() != InductionPHI)
1283       InductionPHI->moveBefore(&InductionPHI->getParent()->front());
1284 
1285     // Split at the place were the induction variable is
1286     // incremented/decremented.
1287     // TODO: This splitting logic may not work always. Fix this.
1288     splitInnerLoopLatch(InnerIndexVar);
1289     LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n");
1290 
1291     // Splits the inner loops phi nodes out into a separate basic block.
1292     BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1293     SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1294     LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1295   }
1296 
1297   Transformed |= adjustLoopLinks();
1298   if (!Transformed) {
1299     LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1300     return false;
1301   }
1302 
1303   return true;
1304 }
1305 
splitInnerLoopLatch(Instruction * Inc)1306 void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
1307   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1308   BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
1309   InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
1310 }
1311 
1312 /// \brief Move all instructions except the terminator from FromBB right before
1313 /// InsertBefore
moveBBContents(BasicBlock * FromBB,Instruction * InsertBefore)1314 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1315   auto &ToList = InsertBefore->getParent()->getInstList();
1316   auto &FromList = FromBB->getInstList();
1317 
1318   ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
1319                 FromBB->getTerminator()->getIterator());
1320 }
1321 
updateIncomingBlock(BasicBlock * CurrBlock,BasicBlock * OldPred,BasicBlock * NewPred)1322 void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
1323                                                    BasicBlock *OldPred,
1324                                                    BasicBlock *NewPred) {
1325   for (PHINode &PHI : CurrBlock->phis()) {
1326     unsigned Num = PHI.getNumIncomingValues();
1327     for (unsigned i = 0; i < Num; ++i) {
1328       if (PHI.getIncomingBlock(i) == OldPred)
1329         PHI.setIncomingBlock(i, NewPred);
1330     }
1331   }
1332 }
1333 
1334 /// Update BI to jump to NewBB instead of OldBB. Records updates to
1335 /// the dominator tree in DTUpdates, if DT should be preserved.
updateSuccessor(BranchInst * BI,BasicBlock * OldBB,BasicBlock * NewBB,std::vector<DominatorTree::UpdateType> & DTUpdates)1336 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1337                             BasicBlock *NewBB,
1338                             std::vector<DominatorTree::UpdateType> &DTUpdates) {
1339   assert(llvm::count_if(BI->successors(),
1340                         [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 &&
1341          "BI must jump to OldBB at most once.");
1342   for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) {
1343     if (BI->getSuccessor(i) == OldBB) {
1344       BI->setSuccessor(i, NewBB);
1345 
1346       DTUpdates.push_back(
1347           {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1348       DTUpdates.push_back(
1349           {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1350       break;
1351     }
1352   }
1353 }
1354 
adjustLoopBranches()1355 bool LoopInterchangeTransform::adjustLoopBranches() {
1356   LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1357   std::vector<DominatorTree::UpdateType> DTUpdates;
1358 
1359   // Adjust the loop preheader
1360   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1361   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1362   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1363   BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1364   BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1365   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1366   BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1367   BasicBlock *InnerLoopLatchPredecessor =
1368       InnerLoopLatch->getUniquePredecessor();
1369   BasicBlock *InnerLoopLatchSuccessor;
1370   BasicBlock *OuterLoopLatchSuccessor;
1371 
1372   BranchInst *OuterLoopLatchBI =
1373       dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1374   BranchInst *InnerLoopLatchBI =
1375       dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1376   BranchInst *OuterLoopHeaderBI =
1377       dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1378   BranchInst *InnerLoopHeaderBI =
1379       dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1380 
1381   if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1382       !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1383       !InnerLoopHeaderBI)
1384     return false;
1385 
1386   BranchInst *InnerLoopLatchPredecessorBI =
1387       dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1388   BranchInst *OuterLoopPredecessorBI =
1389       dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1390 
1391   if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1392     return false;
1393   BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1394   if (!InnerLoopHeaderSuccessor)
1395     return false;
1396 
1397   // Adjust Loop Preheader and headers
1398   updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1399                   InnerLoopPreHeader, DTUpdates);
1400   updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates);
1401   updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1402                   InnerLoopHeaderSuccessor, DTUpdates);
1403 
1404   // Adjust reduction PHI's now that the incoming block has changed.
1405   updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
1406                       OuterLoopHeader);
1407 
1408   updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1409                   OuterLoopPreHeader, DTUpdates);
1410 
1411   // -------------Adjust loop latches-----------
1412   if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1413     InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1414   else
1415     InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1416 
1417   updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1418                   InnerLoopLatchSuccessor, DTUpdates);
1419 
1420   // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
1421   // the value and remove this PHI node from inner loop.
1422   SmallVector<PHINode *, 8> LcssaVec;
1423   for (PHINode &P : InnerLoopLatchSuccessor->phis())
1424     LcssaVec.push_back(&P);
1425 
1426   for (PHINode *P : LcssaVec) {
1427     Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
1428     P->replaceAllUsesWith(Incoming);
1429     P->eraseFromParent();
1430   }
1431 
1432   if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1433     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1434   else
1435     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1436 
1437   updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1438                   OuterLoopLatchSuccessor, DTUpdates);
1439   updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1440                   DTUpdates);
1441 
1442   updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
1443 
1444   DT->applyUpdates(DTUpdates);
1445   restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1446                    OuterLoopPreHeader);
1447 
1448   // Now update the reduction PHIs in the inner and outer loop headers.
1449   SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1450   for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
1451     InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
1452   for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
1453     OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
1454 
1455   for (PHINode *PHI : OuterLoopPHIs)
1456     PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1457 
1458   // Move the PHI nodes from the inner loop header to the outer loop header.
1459   // We have to deal with one kind of PHI nodes:
1460   //  1) PHI nodes that are part of inner loop-only reductions.
1461   // We only have to move the PHI node and update the incoming blocks.
1462   for (PHINode *PHI : InnerLoopPHIs) {
1463     PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1464     for (BasicBlock *InBB : PHI->blocks()) {
1465       if (InnerLoop->contains(InBB))
1466         continue;
1467 
1468       assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) &&
1469              "Unexpected incoming PHI node, reductions in outer loop are not "
1470              "supported yet");
1471       PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB));
1472       PHI->eraseFromParent();
1473       break;
1474     }
1475   }
1476 
1477   // Update the incoming blocks for moved PHI nodes.
1478   updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
1479   updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch);
1480   updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader);
1481   updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch);
1482 
1483   return true;
1484 }
1485 
adjustLoopPreheaders()1486 void LoopInterchangeTransform::adjustLoopPreheaders() {
1487   // We have interchanged the preheaders so we need to interchange the data in
1488   // the preheader as well.
1489   // This is because the content of inner preheader was previously executed
1490   // inside the outer loop.
1491   BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1492   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1493   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1494   BranchInst *InnerTermBI =
1495       cast<BranchInst>(InnerLoopPreHeader->getTerminator());
1496 
1497   // These instructions should now be executed inside the loop.
1498   // Move instruction into a new block after outer header.
1499   moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
1500   // These instructions were not executed previously in the loop so move them to
1501   // the older inner loop preheader.
1502   moveBBContents(OuterLoopPreHeader, InnerTermBI);
1503 }
1504 
adjustLoopLinks()1505 bool LoopInterchangeTransform::adjustLoopLinks() {
1506   // Adjust all branches in the inner and outer loop.
1507   bool Changed = adjustLoopBranches();
1508   if (Changed)
1509     adjustLoopPreheaders();
1510   return Changed;
1511 }
1512 
1513 char LoopInterchange::ID = 0;
1514 
1515 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
1516                       "Interchanges loops for cache reuse", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)1517 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
1518 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
1519 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1520 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
1521 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
1522 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
1523 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
1524 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
1525 
1526 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
1527                     "Interchanges loops for cache reuse", false, false)
1528 
1529 Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
1530