1 //===- subzero/src/IceCfg.cpp - Control flow graph implementation ---------===//
2 //
3 //                        The Subzero Code Generator
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Implements the Cfg class.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "IceCfg.h"
16 
17 #include "IceAssembler.h"
18 #include "IceBitVector.h"
19 #include "IceCfgNode.h"
20 #include "IceClFlags.h"
21 #include "IceDefs.h"
22 #include "IceELFObjectWriter.h"
23 #include "IceGlobalInits.h"
24 #include "IceInst.h"
25 #include "IceInstVarIter.h"
26 #include "IceInstrumentation.h"
27 #include "IceLiveness.h"
28 #include "IceLoopAnalyzer.h"
29 #include "IceOperand.h"
30 #include "IceTargetLowering.h"
31 
32 #include <memory>
33 #include <utility>
34 
35 namespace Ice {
36 
Cfg(GlobalContext * Ctx,uint32_t SequenceNumber)37 Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)
38     : Allocator(createAllocator()), Ctx(Ctx), SequenceNumber(SequenceNumber),
39       VMask(getFlags().getVerbose()), FunctionName(),
40       NextInstNumber(Inst::NumberInitial), Live(nullptr) {
41   NodeStrings.reset(new StringPool);
42   VarStrings.reset(new StringPool);
43   CfgLocalAllocatorScope _(this);
44   Target = TargetLowering::createLowering(getFlags().getTargetArch(), this);
45   VMetadata.reset(new VariablesMetadata(this));
46   TargetAssembler = Target->createAssembler();
47 }
48 
~Cfg()49 Cfg::~Cfg() {
50   assert(CfgAllocatorTraits::current() == nullptr);
51   if (getFlags().getDumpStrings()) {
52     OstreamLocker _(Ctx);
53     Ostream &Str = Ctx->getStrDump();
54     getNodeStrings()->dump(Str);
55     getVarStrings()->dump(Str);
56   }
57 }
58 
59 // Called in the initalizer list of Cfg's constructor to create the Allocator
60 // and set it as TLS before any other member fields are constructed, since they
61 // may depend on it.
createAllocator()62 ArenaAllocator *Cfg::createAllocator() {
63   ArenaAllocator *Allocator = new ArenaAllocator();
64   CfgAllocatorTraits::set_current(Allocator);
65   return Allocator;
66 }
67 
68 /// Create a string like "foo(i=123:b=9)" indicating the function name, number
69 /// of high-level instructions, and number of basic blocks.  This string is only
70 /// used for dumping and other diagnostics, and the idea is that given a set of
71 /// functions to debug a problem on, it's easy to find the smallest or simplest
72 /// function to attack.  Note that the counts may change somewhat depending on
73 /// what point it is called during the translation passes.
getFunctionNameAndSize() const74 std::string Cfg::getFunctionNameAndSize() const {
75   if (!BuildDefs::dump())
76     return getFunctionName().toString();
77   SizeT NodeCount = 0;
78   SizeT InstCount = 0;
79   for (CfgNode *Node : getNodes()) {
80     ++NodeCount;
81     // Note: deleted instructions are *not* ignored.
82     InstCount += Node->getPhis().size();
83     for (Inst &I : Node->getInsts()) {
84       if (!llvm::isa<InstTarget>(&I))
85         ++InstCount;
86     }
87   }
88   return getFunctionName() + "(i=" + std::to_string(InstCount) +
89          ":b=" + std::to_string(NodeCount) + ")";
90 }
91 
setError(const std::string & Message)92 void Cfg::setError(const std::string &Message) {
93   HasError = true;
94   ErrorMessage = Message;
95 }
96 
makeNode()97 CfgNode *Cfg::makeNode() {
98   SizeT LabelIndex = Nodes.size();
99   auto *Node = CfgNode::create(this, LabelIndex);
100   Nodes.push_back(Node);
101   return Node;
102 }
103 
swapNodes(NodeList & NewNodes)104 void Cfg::swapNodes(NodeList &NewNodes) {
105   assert(Nodes.size() == NewNodes.size());
106   Nodes.swap(NewNodes);
107   for (SizeT I = 0, NumNodes = getNumNodes(); I < NumNodes; ++I)
108     Nodes[I]->resetIndex(I);
109 }
110 
makeVariable(Type Ty)111 template <> Variable *Cfg::makeVariable<Variable>(Type Ty) {
112   SizeT Index = Variables.size();
113   Variable *Var;
114   if (Target->shouldSplitToVariableVecOn32(Ty)) {
115     Var = VariableVecOn32::create(this, Ty, Index);
116   } else if (Target->shouldSplitToVariable64On32(Ty)) {
117     Var = Variable64On32::create(this, Ty, Index);
118   } else {
119     Var = Variable::create(this, Ty, Index);
120   }
121   Variables.push_back(Var);
122   return Var;
123 }
124 
addArg(Variable * Arg)125 void Cfg::addArg(Variable *Arg) {
126   Arg->setIsArg();
127   Args.push_back(Arg);
128 }
129 
addImplicitArg(Variable * Arg)130 void Cfg::addImplicitArg(Variable *Arg) {
131   Arg->setIsImplicitArg();
132   ImplicitArgs.push_back(Arg);
133 }
134 
135 // Returns whether the stack frame layout has been computed yet. This is used
136 // for dumping the stack frame location of Variables.
hasComputedFrame() const137 bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }
138 
139 namespace {
140 constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$";
141 constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$";
142 } // end of anonymous namespace
143 
createNodeNameDeclaration(const std::string & NodeAsmName)144 void Cfg::createNodeNameDeclaration(const std::string &NodeAsmName) {
145   auto *Var = VariableDeclaration::create(GlobalInits.get());
146   Var->setName(Ctx, BlockNameGlobalPrefix + NodeAsmName);
147   Var->setIsConstant(true);
148   Var->addInitializer(VariableDeclaration::DataInitializer::create(
149       GlobalInits.get(), NodeAsmName.data(), NodeAsmName.size() + 1));
150   const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
151   Var->setAlignment(Int64ByteSize); // Wasteful, 32-bit could use 4 bytes.
152   GlobalInits->push_back(Var);
153 }
154 
createBlockProfilingInfoDeclaration(const std::string & NodeAsmName,VariableDeclaration * NodeNameDeclaration)155 void Cfg::createBlockProfilingInfoDeclaration(
156     const std::string &NodeAsmName, VariableDeclaration *NodeNameDeclaration) {
157   auto *Var = VariableDeclaration::create(GlobalInits.get());
158   Var->setName(Ctx, BlockStatsGlobalPrefix + NodeAsmName);
159   const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
160   Var->addInitializer(VariableDeclaration::ZeroInitializer::create(
161       GlobalInits.get(), Int64ByteSize));
162 
163   const RelocOffsetT NodeNameDeclarationOffset = 0;
164   Var->addInitializer(VariableDeclaration::RelocInitializer::create(
165       GlobalInits.get(), NodeNameDeclaration,
166       {RelocOffset::create(Ctx, NodeNameDeclarationOffset)}));
167   Var->setAlignment(Int64ByteSize);
168   GlobalInits->push_back(Var);
169 }
170 
translate()171 void Cfg::translate() {
172   if (hasError())
173     return;
174   // Cache the possibly-overridden optimization level once translation begins.
175   // It would be nicer to do this in the constructor, but we need to wait until
176   // after setFunctionName() has a chance to be called.
177   OptimizationLevel =
178       getFlags().matchForceO2(getFunctionName(), getSequenceNumber())
179           ? Opt_2
180           : getFlags().getOptLevel();
181   if (BuildDefs::timers()) {
182     if (getFlags().matchTimingFocus(getFunctionName(), getSequenceNumber())) {
183       setFocusedTiming();
184       getContext()->resetTimer(GlobalContext::TSK_Default);
185     }
186   }
187   if (BuildDefs::dump()) {
188     if (isVerbose(IceV_Status) &&
189         getFlags().matchTestStatus(getFunctionName(), getSequenceNumber())) {
190       getContext()->getStrDump()
191           << ">>>Translating " << getFunctionNameAndSize()
192           << " seq=" << getSequenceNumber() << "\n";
193     }
194   }
195   TimerMarker T_func(getContext(), getFunctionName().toStringOrEmpty());
196   TimerMarker T(TimerStack::TT_translate, this);
197 
198   dump("Initial CFG");
199 
200   // Create the Hi and Lo variables where a split was needed
201   for (Variable *Var : Variables) {
202     if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) {
203       Var64On32->initHiLo(this);
204     } else if (auto *VarVecOn32 = llvm::dyn_cast<VariableVecOn32>(Var)) {
205       VarVecOn32->initVecElement(this);
206     }
207   }
208 
209   // Instrument the Cfg, e.g. with AddressSanitizer
210   if (!BuildDefs::minimal() && getFlags().getSanitizeAddresses()) {
211     getContext()->instrumentFunc(this);
212     dump("Instrumented CFG");
213   }
214 
215   // The set of translation passes and their order are determined by the
216   // target.
217   getTarget()->translate();
218 
219   dump("Final output");
220   if (getFocusedTiming()) {
221     getContext()->dumpLocalTimers(getFunctionName().toString());
222   }
223 }
224 
fixPhiNodes()225 void Cfg::fixPhiNodes() {
226   for (auto *Node : Nodes) {
227     // Fix all the phi edges since WASM can't tell how to make them correctly at
228     // the beginning.
229     assert(Node);
230     const auto &InEdges = Node->getInEdges();
231     for (auto &Instr : Node->getPhis()) {
232       auto *Phi = llvm::cast<InstPhi>(&Instr);
233       assert(Phi);
234       for (SizeT i = 0; i < InEdges.size(); ++i) {
235         Phi->setLabel(i, InEdges[i]);
236       }
237     }
238   }
239 }
240 
computeInOutEdges()241 void Cfg::computeInOutEdges() {
242   // Compute the out-edges.
243   for (CfgNode *Node : Nodes) {
244     Node->computeSuccessors();
245   }
246 
247   // Prune any unreachable nodes before computing in-edges.
248   SizeT NumNodes = getNumNodes();
249   BitVector Reachable(NumNodes);
250   BitVector Pending(NumNodes);
251   Pending.set(getEntryNode()->getIndex());
252   while (true) {
253     int Index = Pending.find_first();
254     if (Index == -1)
255       break;
256     Pending.reset(Index);
257     Reachable.set(Index);
258     CfgNode *Node = Nodes[Index];
259     assert(Node->getIndex() == (SizeT)Index);
260     for (CfgNode *Succ : Node->getOutEdges()) {
261       SizeT SuccIndex = Succ->getIndex();
262       if (!Reachable.test(SuccIndex))
263         Pending.set(SuccIndex);
264     }
265   }
266   SizeT Dest = 0;
267   for (SizeT Source = 0; Source < NumNodes; ++Source) {
268     if (Reachable.test(Source)) {
269       Nodes[Dest] = Nodes[Source];
270       Nodes[Dest]->resetIndex(Dest);
271       // Compute the in-edges.
272       Nodes[Dest]->computePredecessors();
273       ++Dest;
274     }
275   }
276   Nodes.resize(Dest);
277 
278   TimerMarker T(TimerStack::TT_phiValidation, this);
279   for (CfgNode *Node : Nodes)
280     Node->enforcePhiConsistency();
281 }
282 
renumberInstructions()283 void Cfg::renumberInstructions() {
284   TimerMarker T(TimerStack::TT_renumberInstructions, this);
285   NextInstNumber = Inst::NumberInitial;
286   for (CfgNode *Node : Nodes)
287     Node->renumberInstructions();
288   // Make sure the entry node is the first node and therefore got the lowest
289   // instruction numbers, to facilitate live range computation of function
290   // arguments.  We want to model function arguments as being live on entry to
291   // the function, otherwise an argument whose only use is in the first
292   // instruction will be assigned a trivial live range and the register
293   // allocator will not recognize its live range as overlapping another
294   // variable's live range.
295   assert(Nodes.empty() || (*Nodes.begin() == getEntryNode()));
296 }
297 
298 // placePhiLoads() must be called before placePhiStores().
placePhiLoads()299 void Cfg::placePhiLoads() {
300   TimerMarker T(TimerStack::TT_placePhiLoads, this);
301   for (CfgNode *Node : Nodes)
302     Node->placePhiLoads();
303 }
304 
305 // placePhiStores() must be called after placePhiLoads().
placePhiStores()306 void Cfg::placePhiStores() {
307   TimerMarker T(TimerStack::TT_placePhiStores, this);
308   for (CfgNode *Node : Nodes)
309     Node->placePhiStores();
310 }
311 
deletePhis()312 void Cfg::deletePhis() {
313   TimerMarker T(TimerStack::TT_deletePhis, this);
314   for (CfgNode *Node : Nodes)
315     Node->deletePhis();
316 }
317 
advancedPhiLowering()318 void Cfg::advancedPhiLowering() {
319   TimerMarker T(TimerStack::TT_advancedPhiLowering, this);
320   // Clear all previously computed live ranges (but not live-in/live-out bit
321   // vectors or last-use markers), because the follow-on register allocation is
322   // only concerned with live ranges across the newly created blocks.
323   for (Variable *Var : Variables) {
324     Var->getLiveRange().reset();
325   }
326   // This splits edges and appends new nodes to the end of the node list. This
327   // can invalidate iterators, so don't use an iterator.
328   SizeT NumNodes = getNumNodes();
329   SizeT NumVars = getNumVariables();
330   for (SizeT I = 0; I < NumNodes; ++I)
331     Nodes[I]->advancedPhiLowering();
332 
333   TimerMarker TT(TimerStack::TT_lowerPhiAssignments, this);
334   if (true) {
335     // The following code does an in-place update of liveness and live ranges
336     // as a result of adding the new phi edge split nodes.
337     getLiveness()->initPhiEdgeSplits(Nodes.begin() + NumNodes,
338                                      Variables.begin() + NumVars);
339     TimerMarker TTT(TimerStack::TT_liveness, this);
340     // Iterate over the newly added nodes to add their liveness info.
341     for (auto I = Nodes.begin() + NumNodes, E = Nodes.end(); I != E; ++I) {
342       InstNumberT FirstInstNum = getNextInstNumber();
343       (*I)->renumberInstructions();
344       InstNumberT LastInstNum = getNextInstNumber() - 1;
345       (*I)->liveness(getLiveness());
346       (*I)->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
347     }
348   } else {
349     // The following code does a brute-force recalculation of live ranges as a
350     // result of adding the new phi edge split nodes. The liveness calculation
351     // is particularly expensive because the new nodes are not yet in a proper
352     // topological order and so convergence is slower.
353     //
354     // This code is kept here for reference and can be temporarily enabled in
355     // case the incremental code is under suspicion.
356     renumberInstructions();
357     liveness(Liveness_Intervals);
358     getVMetadata()->init(VMK_All);
359   }
360   Target->regAlloc(RAK_Phi);
361 }
362 
363 // Find a reasonable placement for nodes that have not yet been placed, while
364 // maintaining the same relative ordering among already placed nodes.
reorderNodes()365 void Cfg::reorderNodes() {
366   // TODO(ascull): it would be nice if the switch tests were always followed by
367   // the default case to allow for fall through.
368   using PlacedList = CfgList<CfgNode *>;
369   PlacedList Placed;      // Nodes with relative placement locked down
370   PlacedList Unreachable; // Unreachable nodes
371   PlacedList::iterator NoPlace = Placed.end();
372   // Keep track of where each node has been tentatively placed so that we can
373   // manage insertions into the middle.
374   CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
375   for (CfgNode *Node : Nodes) {
376     // The "do ... while(0);" construct is to factor out the --PlaceIndex and
377     // assert() statements before moving to the next node.
378     do {
379       if (Node != getEntryNode() && Node->getInEdges().empty()) {
380         // The node has essentially been deleted since it is not a successor of
381         // any other node.
382         Unreachable.push_back(Node);
383         PlaceIndex[Node->getIndex()] = Unreachable.end();
384         Node->setNeedsPlacement(false);
385         continue;
386       }
387       if (!Node->needsPlacement()) {
388         // Add to the end of the Placed list.
389         Placed.push_back(Node);
390         PlaceIndex[Node->getIndex()] = Placed.end();
391         continue;
392       }
393       Node->setNeedsPlacement(false);
394       // Assume for now that the unplaced node is from edge-splitting and
395       // therefore has 1 in-edge and 1 out-edge (actually, possibly more than 1
396       // in-edge if the predecessor node was contracted). If this changes in
397       // the future, rethink the strategy.
398       assert(Node->getInEdges().size() >= 1);
399       assert(Node->hasSingleOutEdge());
400 
401       // If it's a (non-critical) edge where the successor has a single
402       // in-edge, then place it before the successor.
403       CfgNode *Succ = Node->getOutEdges().front();
404       if (Succ->getInEdges().size() == 1 &&
405           PlaceIndex[Succ->getIndex()] != NoPlace) {
406         Placed.insert(PlaceIndex[Succ->getIndex()], Node);
407         PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()];
408         continue;
409       }
410 
411       // Otherwise, place it after the (first) predecessor.
412       CfgNode *Pred = Node->getInEdges().front();
413       auto PredPosition = PlaceIndex[Pred->getIndex()];
414       // It shouldn't be the case that PredPosition==NoPlace, but if that
415       // somehow turns out to be true, we just insert Node before
416       // PredPosition=NoPlace=Placed.end() .
417       if (PredPosition != NoPlace)
418         ++PredPosition;
419       Placed.insert(PredPosition, Node);
420       PlaceIndex[Node->getIndex()] = PredPosition;
421     } while (0);
422 
423     --PlaceIndex[Node->getIndex()];
424     assert(*PlaceIndex[Node->getIndex()] == Node);
425   }
426 
427   // Reorder Nodes according to the built-up lists.
428   NodeList Reordered;
429   Reordered.reserve(Placed.size() + Unreachable.size());
430   for (CfgNode *Node : Placed)
431     Reordered.push_back(Node);
432   for (CfgNode *Node : Unreachable)
433     Reordered.push_back(Node);
434   assert(getNumNodes() == Reordered.size());
435   swapNodes(Reordered);
436 }
437 
localCSE(bool AssumeSSA)438 void Cfg::localCSE(bool AssumeSSA) {
439   // Performs basic-block local common-subexpression elimination
440   // If we have
441   // t1 = op b c
442   // t2 = op b c
443   // This pass will replace future references to t2 in a basic block by t1
444   // Points to note:
445   // 1. Assumes SSA by default. To change this, use -lcse=no-ssa
446   //      This is needed if this pass is moved to a point later in the pipeline.
447   //      If variables have a single definition (in the node), CSE can work just
448   //      on the basis of an equality compare on instructions (sans Dest). When
449   //      variables can be updated (hence, non-SSA) the result of a previous
450   //      instruction which used that variable as an operand can not be reused.
451   // 2. Leaves removal of instructions to DCE.
452   // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
453   //    to take care of cases not arising from GEP simplification.
454   // 4. By default, a single pass is made over each basic block. Control this
455   //    with -lcse-max-iters=N
456 
457   TimerMarker T(TimerStack::TT_localCse, this);
458   struct VariableHash {
459     size_t operator()(const Variable *Var) const { return Var->hashValue(); }
460   };
461 
462   struct InstHash {
463     size_t operator()(const Inst *Instr) const {
464       auto Kind = Instr->getKind();
465       auto Result =
466           std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
467               Kind);
468       for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
469         Result ^= Instr->getSrc(i)->hashValue();
470       }
471       return Result;
472     }
473   };
474   struct InstEq {
475     bool srcEq(const Operand *A, const Operand *B) const {
476       if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
477         return (A == B);
478       return false;
479     }
480     bool operator()(const Inst *InstrA, const Inst *InstrB) const {
481       if ((InstrA->getKind() != InstrB->getKind()) ||
482           (InstrA->getSrcSize() != InstrB->getSrcSize()))
483         return false;
484 
485       if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
486         auto *B = llvm::cast<InstArithmetic>(InstrB);
487         // A, B are guaranteed to be of the same 'kind' at this point
488         // So, dyn_cast is not needed
489         if (A->getOp() != B->getOp())
490           return false;
491       }
492       // Does not enter loop if different kind or number of operands
493       for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
494         if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
495           return false;
496       }
497       return true;
498     }
499   };
500 
501   for (CfgNode *Node : getNodes()) {
502     CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
503     // Stores currently available instructions.
504 
505     CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
506     // Combining the above two into a single data structure might consume less
507     // memory but will be slower i.e map of Instruction -> Set of Variables
508 
509     CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
510     // Maps a variable to the Instructions that depend on it.
511     // a = op1 b c
512     // x = op2 c d
513     // Will result in the map : b -> {a}, c -> {a, x}, d -> {x}
514     // Not necessary for SSA as dependencies will never be invalidated, and the
515     // container will use minimal memory when left unused.
516 
517     auto IterCount = getFlags().getLocalCseMaxIterations();
518 
519     for (uint32_t i = 0; i < IterCount; ++i) {
520       // TODO(manasijm): Stats on IterCount -> performance
521       for (Inst &Instr : Node->getInsts()) {
522         if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
523           continue;
524         if (!AssumeSSA) {
525           // Invalidate replacements
526           auto Iter = Replacements.find(Instr.getDest());
527           if (Iter != Replacements.end()) {
528             Replacements.erase(Iter);
529           }
530 
531           // Invalidate 'seen' instructions whose operands were just updated.
532           auto DepIter = Dependency.find(Instr.getDest());
533           if (DepIter != Dependency.end()) {
534             for (auto *DepInst : DepIter->second) {
535               Seen.erase(DepInst);
536             }
537           }
538         }
539 
540         // Replace - doing this before checking for repetitions might enable
541         // more optimizations
542         for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
543           auto *Opnd = Instr.getSrc(i);
544           if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
545             if (Replacements.find(Var) != Replacements.end()) {
546               Instr.replaceSource(i, Replacements[Var]);
547             }
548           }
549         }
550 
551         // Check for repetitions
552         auto SeenIter = Seen.find(&Instr);
553         if (SeenIter != Seen.end()) { // seen before
554           const Inst *Found = *SeenIter;
555           Replacements[Instr.getDest()] = Found->getDest();
556         } else { // new
557           Seen.insert(&Instr);
558 
559           if (!AssumeSSA) {
560             // Update dependencies
561             for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
562               auto *Opnd = Instr.getSrc(i);
563               if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
564                 Dependency[Var].push_back(&Instr);
565               }
566             }
567           }
568         }
569       }
570     }
571   }
572 }
573 
loopInvariantCodeMotion()574 void Cfg::loopInvariantCodeMotion() {
575   TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
576   // Does not introduce new nodes as of now.
577   for (auto &Loop : LoopInfo) {
578     CfgNode *Header = Loop.Header;
579     assert(Header);
580     if (Header->getLoopNestDepth() < 1)
581       return;
582     CfgNode *PreHeader = Loop.PreHeader;
583     if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) {
584       return; // try next loop
585     }
586 
587     auto &Insts = PreHeader->getInsts();
588     auto &LastInst = Insts.back();
589     Insts.pop_back();
590 
591     for (auto *Inst : findLoopInvariantInstructions(Loop.Body)) {
592       PreHeader->appendInst(Inst);
593     }
594     PreHeader->appendInst(&LastInst);
595   }
596 }
597 
598 CfgVector<Inst *>
findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> & Body)599 Cfg::findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> &Body) {
600   CfgUnorderedSet<Inst *> InvariantInsts;
601   CfgUnorderedSet<Variable *> InvariantVars;
602   for (auto *Var : getArgs()) {
603     InvariantVars.insert(Var);
604   }
605   bool Changed = false;
606   do {
607     Changed = false;
608     for (auto NodeIndex : Body) {
609       auto *Node = Nodes[NodeIndex];
610       CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(),
611                                                     Node->getInsts().end());
612 
613       for (auto &InstRef : Insts) {
614         auto &Inst = InstRef.get();
615         if (Inst.isDeleted() ||
616             InvariantInsts.find(&Inst) != InvariantInsts.end())
617           continue;
618         switch (Inst.getKind()) {
619         case Inst::InstKind::Alloca:
620         case Inst::InstKind::Br:
621         case Inst::InstKind::Ret:
622         case Inst::InstKind::Phi:
623         case Inst::InstKind::Call:
624         case Inst::InstKind::Intrinsic:
625         case Inst::InstKind::Load:
626         case Inst::InstKind::Store:
627         case Inst::InstKind::Switch:
628           continue;
629         default:
630           break;
631         }
632 
633         bool IsInvariant = true;
634         for (SizeT i = 0; i < Inst.getSrcSize(); ++i) {
635           if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) {
636             if (InvariantVars.find(Var) == InvariantVars.end()) {
637               IsInvariant = false;
638             }
639           }
640         }
641         if (IsInvariant) {
642           Changed = true;
643           InvariantInsts.insert(&Inst);
644           Node->getInsts().remove(Inst);
645           if (Inst.getDest() != nullptr) {
646             InvariantVars.insert(Inst.getDest());
647           }
648         }
649       }
650     }
651   } while (Changed);
652 
653   CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end());
654   std::sort(InstVector.begin(), InstVector.end(),
655             [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); });
656   return InstVector;
657 }
658 
shortCircuitJumps()659 void Cfg::shortCircuitJumps() {
660   // Split Nodes whenever an early jump is possible.
661   // __N :
662   //   a = <something>
663   //   Instruction 1 without side effect
664   //   ... b = <something> ...
665   //   Instruction N without side effect
666   //   t1 = or a b
667   //   br t1 __X __Y
668   //
669   // is transformed into:
670   // __N :
671   //   a = <something>
672   //   br a __X __N_ext
673   //
674   // __N_ext :
675   //   Instruction 1 without side effect
676   //   ... b = <something> ...
677   //   Instruction N without side effect
678   //   br b __X __Y
679   // (Similar logic for AND, jump to false instead of true target.)
680 
681   TimerMarker T(TimerStack::TT_shortCircuit, this);
682   getVMetadata()->init(VMK_Uses);
683   auto NodeStack = this->getNodes();
684   CfgUnorderedMap<SizeT, CfgVector<CfgNode *>> Splits;
685   while (!NodeStack.empty()) {
686     auto *Node = NodeStack.back();
687     NodeStack.pop_back();
688     auto NewNode = Node->shortCircuit();
689     if (NewNode) {
690       NodeStack.push_back(NewNode);
691       NodeStack.push_back(Node);
692       Splits[Node->getIndex()].push_back(NewNode);
693     }
694   }
695 
696   // Insert nodes in the right place
697   NodeList NewList;
698   NewList.reserve(Nodes.size());
699   CfgUnorderedSet<SizeT> Inserted;
700   for (auto *Node : Nodes) {
701     if (Inserted.find(Node->getIndex()) != Inserted.end())
702       continue; // already inserted
703     NodeList Stack{Node};
704     while (!Stack.empty()) {
705       auto *Current = Stack.back();
706       Stack.pop_back();
707       Inserted.insert(Current->getIndex());
708       NewList.push_back(Current);
709       for (auto *Next : Splits[Current->getIndex()]) {
710         Stack.push_back(Next);
711       }
712     }
713   }
714 
715   SizeT NodeIndex = 0;
716   for (auto *Node : NewList) {
717     Node->resetIndex(NodeIndex++);
718   }
719   Nodes = NewList;
720 }
721 
floatConstantCSE()722 void Cfg::floatConstantCSE() {
723   // Load multiple uses of a floating point constant (between two call
724   // instructions or block start/end) into a variable before its first use.
725   //   t1 = b + 1.0
726   //   t2 = c + 1.0
727   // Gets transformed to:
728   //   t0 = 1.0
729   //   t0_1 = t0
730   //   t1 = b + t0_1
731   //   t2 = c + t0_1
732   // Call instructions reset the procedure, but use the same variable, just in
733   // case it got a register. We are assuming floating point registers are not
734   // callee saved in general. Example, continuing from before:
735   //   result = call <some function>
736   //   t3 = d + 1.0
737   // Gets transformed to:
738   //   result = call <some function>
739   //   t0_2 = t0
740   //   t3 = d + t0_2
741   // TODO(manasijm, stichnot): Figure out how to 'link' t0 to the stack slot of
742   // 1.0. When t0 does not get a register, introducing an extra assignment
743   // statement does not make sense. The relevant portion is marked below.
744 
745   TimerMarker _(TimerStack::TT_floatConstantCse, this);
746   for (CfgNode *Node : getNodes()) {
747 
748     CfgUnorderedMap<Constant *, Variable *> ConstCache;
749     auto Current = Node->getInsts().begin();
750     auto End = Node->getInsts().end();
751     while (Current != End) {
752       CfgUnorderedMap<Constant *, CfgVector<InstList::iterator>> FloatUses;
753       if (llvm::isa<InstCall>(iteratorToInst(Current))) {
754         ++Current;
755         assert(Current != End);
756         // Block should not end with a call
757       }
758       while (Current != End && !llvm::isa<InstCall>(iteratorToInst(Current))) {
759         if (!Current->isDeleted()) {
760           for (SizeT i = 0; i < Current->getSrcSize(); ++i) {
761             if (auto *Const = llvm::dyn_cast<Constant>(Current->getSrc(i))) {
762               if (Const->getType() == IceType_f32 ||
763                   Const->getType() == IceType_f64) {
764                 FloatUses[Const].push_back(Current);
765               }
766             }
767           }
768         }
769         ++Current;
770       }
771       for (auto &Pair : FloatUses) {
772         static constexpr SizeT MinUseThreshold = 3;
773         if (Pair.second.size() < MinUseThreshold)
774           continue;
775         // Only consider constants with at least `MinUseThreshold` uses
776         auto &Insts = Node->getInsts();
777 
778         if (ConstCache.find(Pair.first) == ConstCache.end()) {
779           // Saw a constant (which is used at least twice) for the first time
780           auto *NewVar = makeVariable(Pair.first->getType());
781           // NewVar->setLinkedTo(Pair.first);
782           // TODO(manasijm): Plumbing for linking to an Operand.
783           auto *Assign = InstAssign::create(Node->getCfg(), NewVar, Pair.first);
784           Insts.insert(Pair.second[0], Assign);
785           ConstCache[Pair.first] = NewVar;
786         }
787 
788         auto *NewVar = makeVariable(Pair.first->getType());
789         NewVar->setLinkedTo(ConstCache[Pair.first]);
790         auto *Assign =
791             InstAssign::create(Node->getCfg(), NewVar, ConstCache[Pair.first]);
792 
793         Insts.insert(Pair.second[0], Assign);
794         for (auto InstUse : Pair.second) {
795           for (SizeT i = 0; i < InstUse->getSrcSize(); ++i) {
796             if (auto *Const = llvm::dyn_cast<Constant>(InstUse->getSrc(i))) {
797               if (Const == Pair.first) {
798                 InstUse->replaceSource(i, NewVar);
799               }
800             }
801           }
802         }
803       }
804     }
805   }
806 }
807 
doArgLowering()808 void Cfg::doArgLowering() {
809   TimerMarker T(TimerStack::TT_doArgLowering, this);
810   getTarget()->lowerArguments();
811 }
812 
sortAndCombineAllocas(CfgVector<InstAlloca * > & Allocas,uint32_t CombinedAlignment,InstList & Insts,AllocaBaseVariableType BaseVariableType)813 void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
814                                 uint32_t CombinedAlignment, InstList &Insts,
815                                 AllocaBaseVariableType BaseVariableType) {
816   if (Allocas.empty())
817     return;
818   // Sort by decreasing alignment.
819   std::sort(Allocas.begin(), Allocas.end(), [](InstAlloca *A1, InstAlloca *A2) {
820     uint32_t Align1 = A1->getAlignInBytes();
821     uint32_t Align2 = A2->getAlignInBytes();
822     if (Align1 == Align2)
823       return A1->getNumber() < A2->getNumber();
824     else
825       return Align1 > Align2;
826   });
827   // Process the allocas in order of decreasing stack alignment.  This allows
828   // us to pack less-aligned pieces after more-aligned ones, resulting in less
829   // stack growth.  It also allows there to be at most one stack alignment "and"
830   // instruction for a whole list of allocas.
831   uint32_t CurrentOffset = 0;
832   CfgVector<int32_t> Offsets;
833   for (Inst *Instr : Allocas) {
834     auto *Alloca = llvm::cast<InstAlloca>(Instr);
835     // Adjust the size of the allocation up to the next multiple of the
836     // object's alignment.
837     uint32_t Alignment = std::max(Alloca->getAlignInBytes(), 1u);
838     auto *ConstSize =
839         llvm::dyn_cast<ConstantInteger32>(Alloca->getSizeInBytes());
840     uint32_t Size = Utils::applyAlignment(ConstSize->getValue(), Alignment);
841     if (BaseVariableType == BVT_FramePointer) {
842       // Addressing is relative to the frame pointer.  Subtract the offset after
843       // adding the size of the alloca, because it grows downwards from the
844       // frame pointer.
845       Offsets.push_back(Target->getFramePointerOffset(CurrentOffset, Size));
846     } else {
847       // Addressing is relative to the stack pointer or to a user pointer.  Add
848       // the offset before adding the size of the object, because it grows
849       // upwards from the stack pointer. In addition, if the addressing is
850       // relative to the stack pointer, we need to add the pre-computed max out
851       // args size bytes.
852       const uint32_t OutArgsOffsetOrZero =
853           (BaseVariableType == BVT_StackPointer)
854               ? getTarget()->maxOutArgsSizeBytes()
855               : 0;
856       Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
857     }
858     // Update the running offset of the fused alloca region.
859     CurrentOffset += Size;
860   }
861   // Round the offset up to the alignment granularity to use as the size.
862   uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment);
863   // Ensure every alloca was assigned an offset.
864   assert(Allocas.size() == Offsets.size());
865 
866   switch (BaseVariableType) {
867   case BVT_UserPointer: {
868     Variable *BaseVariable = makeVariable(IceType_i32);
869     for (SizeT i = 0; i < Allocas.size(); ++i) {
870       auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
871       // Emit a new addition operation to replace the alloca.
872       Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]);
873       InstArithmetic *Add =
874           InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(),
875                                  BaseVariable, AllocaOffset);
876       Insts.push_front(Add);
877       Alloca->setDeleted();
878     }
879     Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
880     InstAlloca *CombinedAlloca =
881         InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment);
882     CombinedAlloca->setKnownFrameOffset();
883     Insts.push_front(CombinedAlloca);
884   } break;
885   case BVT_StackPointer:
886   case BVT_FramePointer: {
887     for (SizeT i = 0; i < Allocas.size(); ++i) {
888       auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
889       // Emit a fake definition of the rematerializable variable.
890       Variable *Dest = Alloca->getDest();
891       auto *Def = InstFakeDef::create(this, Dest);
892       if (BaseVariableType == BVT_StackPointer)
893         Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
894       else
895         Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]);
896       Insts.push_front(Def);
897       Alloca->setDeleted();
898     }
899     // Allocate the fixed area in the function prolog.
900     getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment);
901   } break;
902   }
903 }
904 
processAllocas(bool SortAndCombine)905 void Cfg::processAllocas(bool SortAndCombine) {
906   TimerMarker _(TimerStack::TT_alloca, this);
907   const uint32_t StackAlignment = getTarget()->getStackAlignment();
908   CfgNode *EntryNode = getEntryNode();
909   assert(EntryNode);
910   // LLVM enforces power of 2 alignment.
911   assert(llvm::isPowerOf2_32(StackAlignment));
912   // If the ABI's stack alignment is smaller than the vector size (16 bytes),
913   // conservatively use a frame pointer to allow for explicit alignment of the
914   // stack pointer. This needs to happen before register allocation so the frame
915   // pointer can be reserved.
916   if (getTarget()->needsStackPointerAlignment()) {
917     getTarget()->setHasFramePointer();
918   }
919   // Determine if there are large alignment allocations in the entry block or
920   // dynamic allocations (variable size in the entry block).
921   bool HasLargeAlignment = false;
922   bool HasDynamicAllocation = false;
923   for (Inst &Instr : EntryNode->getInsts()) {
924     if (Instr.isDeleted())
925       continue;
926     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
927       uint32_t AlignmentParam = Alloca->getAlignInBytes();
928       if (AlignmentParam > StackAlignment)
929         HasLargeAlignment = true;
930       if (llvm::isa<Constant>(Alloca->getSizeInBytes()))
931         Alloca->setKnownFrameOffset();
932       else {
933         HasDynamicAllocation = true;
934         // If Allocas are not sorted, the first dynamic allocation causes
935         // later Allocas to be at unknown offsets relative to the stack/frame.
936         if (!SortAndCombine)
937           break;
938       }
939     }
940   }
941   // Don't do the heavyweight sorting and layout for low optimization levels.
942   if (!SortAndCombine)
943     return;
944   // Any alloca outside the entry block is a dynamic allocation.
945   for (CfgNode *Node : Nodes) {
946     if (Node == EntryNode)
947       continue;
948     for (Inst &Instr : Node->getInsts()) {
949       if (Instr.isDeleted())
950         continue;
951       if (llvm::isa<InstAlloca>(&Instr)) {
952         // Allocations outside the entry block require a frame pointer.
953         HasDynamicAllocation = true;
954         break;
955       }
956     }
957     if (HasDynamicAllocation)
958       break;
959   }
960   // Mark the target as requiring a frame pointer.
961   if (HasLargeAlignment || HasDynamicAllocation)
962     getTarget()->setHasFramePointer();
963   // Collect the Allocas into the two vectors.
964   // Allocas in the entry block that have constant size and alignment less
965   // than or equal to the function's stack alignment.
966   CfgVector<InstAlloca *> FixedAllocas;
967   // Allocas in the entry block that have constant size and alignment greater
968   // than the function's stack alignment.
969   CfgVector<InstAlloca *> AlignedAllocas;
970   // Maximum alignment used by any alloca.
971   uint32_t MaxAlignment = StackAlignment;
972   for (Inst &Instr : EntryNode->getInsts()) {
973     if (Instr.isDeleted())
974       continue;
975     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
976       if (!llvm::isa<Constant>(Alloca->getSizeInBytes()))
977         continue;
978       uint32_t AlignmentParam = Alloca->getAlignInBytes();
979       // For default align=0, set it to the real value 1, to avoid any
980       // bit-manipulation problems below.
981       AlignmentParam = std::max(AlignmentParam, 1u);
982       assert(llvm::isPowerOf2_32(AlignmentParam));
983       if (HasDynamicAllocation && AlignmentParam > StackAlignment) {
984         // If we have both dynamic allocations and large stack alignments,
985         // high-alignment allocations are pulled out with their own base.
986         AlignedAllocas.push_back(Alloca);
987       } else {
988         FixedAllocas.push_back(Alloca);
989       }
990       MaxAlignment = std::max(AlignmentParam, MaxAlignment);
991     }
992   }
993   // Add instructions to the head of the entry block in reverse order.
994   InstList &Insts = getEntryNode()->getInsts();
995   if (HasDynamicAllocation && HasLargeAlignment) {
996     // We are using a frame pointer, but fixed large-alignment alloca addresses
997     // do not have a known offset from either the stack or frame pointer.
998     // They grow up from a user pointer from an alloca.
999     sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer);
1000     // Fixed size allocas are addressed relative to the frame pointer.
1001     sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts,
1002                           BVT_FramePointer);
1003   } else {
1004     // Otherwise, fixed size allocas are addressed relative to the stack unless
1005     // there are dynamic allocas.
1006     const AllocaBaseVariableType BasePointerType =
1007         (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer);
1008     sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
1009   }
1010   if (!FixedAllocas.empty() || !AlignedAllocas.empty())
1011     // No use calling findRematerializable() unless there is some
1012     // rematerializable alloca instruction to seed it.
1013     findRematerializable();
1014 }
1015 
1016 namespace {
1017 
1018 // Helpers for findRematerializable().  For each of them, if a suitable
1019 // rematerialization is found, the instruction's Dest variable is set to be
1020 // rematerializable and it returns true, otherwise it returns false.
1021 
rematerializeArithmetic(const Inst * Instr)1022 bool rematerializeArithmetic(const Inst *Instr) {
1023   // Check that it's an Arithmetic instruction with an Add operation.
1024   auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr);
1025   if (Arith == nullptr || Arith->getOp() != InstArithmetic::Add)
1026     return false;
1027   // Check that Src(0) is rematerializable.
1028   auto *Src0Var = llvm::dyn_cast<Variable>(Arith->getSrc(0));
1029   if (Src0Var == nullptr || !Src0Var->isRematerializable())
1030     return false;
1031   // Check that Src(1) is an immediate.
1032   auto *Src1Imm = llvm::dyn_cast<ConstantInteger32>(Arith->getSrc(1));
1033   if (Src1Imm == nullptr)
1034     return false;
1035   Arith->getDest()->setRematerializable(
1036       Src0Var->getRegNum(), Src0Var->getStackOffset() + Src1Imm->getValue());
1037   return true;
1038 }
1039 
rematerializeAssign(const Inst * Instr)1040 bool rematerializeAssign(const Inst *Instr) {
1041   // An InstAssign only originates from an inttoptr or ptrtoint instruction,
1042   // which never occurs in a MINIMAL build.
1043   if (BuildDefs::minimal())
1044     return false;
1045   // Check that it's an Assign instruction.
1046   if (!llvm::isa<InstAssign>(Instr))
1047     return false;
1048   // Check that Src(0) is rematerializable.
1049   auto *Src0Var = llvm::dyn_cast<Variable>(Instr->getSrc(0));
1050   if (Src0Var == nullptr || !Src0Var->isRematerializable())
1051     return false;
1052   Instr->getDest()->setRematerializable(Src0Var->getRegNum(),
1053                                         Src0Var->getStackOffset());
1054   return true;
1055 }
1056 
rematerializeCast(const Inst * Instr)1057 bool rematerializeCast(const Inst *Instr) {
1058   // An pointer-type bitcast never occurs in a MINIMAL build.
1059   if (BuildDefs::minimal())
1060     return false;
1061   // Check that it's a Cast instruction with a Bitcast operation.
1062   auto *Cast = llvm::dyn_cast<InstCast>(Instr);
1063   if (Cast == nullptr || Cast->getCastKind() != InstCast::Bitcast)
1064     return false;
1065   // Check that Src(0) is rematerializable.
1066   auto *Src0Var = llvm::dyn_cast<Variable>(Cast->getSrc(0));
1067   if (Src0Var == nullptr || !Src0Var->isRematerializable())
1068     return false;
1069   // Check that Dest and Src(0) have the same type.
1070   Variable *Dest = Cast->getDest();
1071   if (Dest->getType() != Src0Var->getType())
1072     return false;
1073   Dest->setRematerializable(Src0Var->getRegNum(), Src0Var->getStackOffset());
1074   return true;
1075 }
1076 
1077 } // end of anonymous namespace
1078 
1079 /// Scan the function to find additional rematerializable variables.  This is
1080 /// possible when the source operand of an InstAssignment is a rematerializable
1081 /// variable, or the same for a pointer-type InstCast::Bitcast, or when an
1082 /// InstArithmetic is an add of a rematerializable variable and an immediate.
1083 /// Note that InstAssignment instructions and pointer-type InstCast::Bitcast
1084 /// instructions generally only come about from the IceConverter's treatment of
1085 /// inttoptr, ptrtoint, and bitcast instructions.  TODO(stichnot): Consider
1086 /// other possibilities, however unlikely, such as InstArithmetic::Sub, or
1087 /// commutativity.
findRematerializable()1088 void Cfg::findRematerializable() {
1089   // Scan the instructions in order, and repeat until no new opportunities are
1090   // found.  It may take more than one iteration because a variable's defining
1091   // block may happen to come after a block where it is used, depending on the
1092   // CfgNode linearization order.
1093   bool FoundNewAssignment;
1094   do {
1095     FoundNewAssignment = false;
1096     for (CfgNode *Node : getNodes()) {
1097       // No need to process Phi instructions.
1098       for (Inst &Instr : Node->getInsts()) {
1099         if (Instr.isDeleted())
1100           continue;
1101         Variable *Dest = Instr.getDest();
1102         if (Dest == nullptr || Dest->isRematerializable())
1103           continue;
1104         if (rematerializeArithmetic(&Instr) || rematerializeAssign(&Instr) ||
1105             rematerializeCast(&Instr)) {
1106           FoundNewAssignment = true;
1107         }
1108       }
1109     }
1110   } while (FoundNewAssignment);
1111 }
1112 
doAddressOpt()1113 void Cfg::doAddressOpt() {
1114   TimerMarker T(TimerStack::TT_doAddressOpt, this);
1115   for (CfgNode *Node : Nodes)
1116     Node->doAddressOpt();
1117 }
1118 
1119 namespace {
1120 // ShuffleVectorUtils implements helper functions for rematerializing
1121 // shufflevector instructions from a sequence of extractelement/insertelement
1122 // instructions. It looks for the following pattern:
1123 //
1124 // %t0 = extractelement A, %n0
1125 // %t1 = extractelement B, %n1
1126 // %t2 = extractelement C, %n2
1127 // ...
1128 // %tN = extractelement N, %nN
1129 // %d0 = insertelement undef, %t0, 0
1130 // %d1 = insertelement %d0, %t1, 1
1131 // %d2 = insertelement %d1, %t2, 2
1132 // ...
1133 // %dest = insertelement %d_N-1, %tN, N
1134 //
1135 // where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two
1136 // distinct variables.
1137 namespace ShuffleVectorUtils {
1138 // findAllInserts is used when searching for all the insertelements that are
1139 // used in a shufflevector operation. This function works recursively, when
1140 // invoked with I = i, the function assumes Insts[i] is the last found
1141 // insertelement in the chain. The next insertelement insertruction is saved in
1142 // Insts[i+1].
findAllInserts(Cfg * Func,GlobalContext * Ctx,VariablesMetadata * VM,CfgVector<const Inst * > * Insts,SizeT I=0)1143 bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
1144                     CfgVector<const Inst *> *Insts, SizeT I = 0) {
1145   const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);
1146 
1147   if (I > Insts->size()) {
1148     if (Verbose) {
1149       Ctx->getStrDump() << "\tToo many inserts.\n";
1150     }
1151     return false;
1152   }
1153 
1154   const auto *LastInsert = Insts->at(I);
1155   assert(llvm::isa<InstInsertElement>(LastInsert));
1156 
1157   if (I == Insts->size() - 1) {
1158     // Matching against undef is not really needed because the value in Src(0)
1159     // will be totally overwritten. We still enforce it anyways because the
1160     // PNaCl toolchain generates the bitcode with it.
1161     if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
1162       if (Verbose) {
1163         Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
1164                           << Insts->size();
1165         LastInsert->dump(Func);
1166         Ctx->getStrDump() << "\n";
1167       }
1168       return false;
1169     }
1170 
1171     // The following loop ensures that the insertelements are sorted. In theory,
1172     // we could relax this restriction and allow any order. As long as each
1173     // index appears exactly once, this chain is still a candidate for becoming
1174     // a shufflevector. The Insts vector is traversed backwards because the
1175     // instructions are "enqueued" in reverse order.
1176     int32_t ExpectedElement = 0;
1177     for (const auto *I : reverse_range(*Insts)) {
1178       if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() !=
1179           ExpectedElement) {
1180         return false;
1181       }
1182       ++ExpectedElement;
1183     }
1184     return true;
1185   }
1186 
1187   const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0));
1188   const auto *Def = VM->getSingleDefinition(Src0V);
1189 
1190   // Only optimize if the first operand in
1191   //
1192   //   Dest = insertelement A, B, 10
1193   //
1194   // is singly-def'ed.
1195   if (Def == nullptr) {
1196     if (Verbose) {
1197       Ctx->getStrDump() << "\tmulti-def: ";
1198       (*Insts)[I]->dump(Func);
1199       Ctx->getStrDump() << "\n";
1200     }
1201     return false;
1202   }
1203 
1204   // We also require the (single) definition to come from an insertelement
1205   // instruction.
1206   if (!llvm::isa<InstInsertElement>(Def)) {
1207     if (Verbose) {
1208       Ctx->getStrDump() << "\tnot insert element: ";
1209       Def->dump(Func);
1210       Ctx->getStrDump() << "\n";
1211     }
1212     return false;
1213   }
1214 
1215   // Everything seems fine, so we save Def in Insts, and delegate the decision
1216   // to findAllInserts.
1217   (*Insts)[I + 1] = Def;
1218 
1219   return findAllInserts(Func, Ctx, VM, Insts, I + 1);
1220 }
1221 
1222 // insertsLastElement returns true if Insert is inserting an element in the last
1223 // position of a vector.
insertsLastElement(const Inst & Insert)1224 bool insertsLastElement(const Inst &Insert) {
1225   const Type DestTy = Insert.getDest()->getType();
1226   assert(isVectorType(DestTy));
1227   const SizeT Elem =
1228       llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
1229   return Elem == typeNumElements(DestTy) - 1;
1230 }
1231 
1232 // findAllExtracts goes over all the insertelement instructions that are
1233 // candidates to be replaced by a shufflevector, and searches for all the
1234 // definitions of the elements being inserted. If all of the elements are the
1235 // result of an extractelement instruction, and all of the extractelements
1236 // operate on at most two different sources, than the instructions can be
1237 // replaced by a shufflevector.
findAllExtracts(Cfg * Func,GlobalContext * Ctx,VariablesMetadata * VM,const CfgVector<const Inst * > & Insts,Variable ** Src0,Variable ** Src1,CfgVector<const Inst * > * Extracts)1238 bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
1239                      const CfgVector<const Inst *> &Insts, Variable **Src0,
1240                      Variable **Src1, CfgVector<const Inst *> *Extracts) {
1241   const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);
1242 
1243   *Src0 = nullptr;
1244   *Src1 = nullptr;
1245   assert(Insts.size() > 0);
1246   for (SizeT I = 0; I < Insts.size(); ++I) {
1247     const auto *Insert = Insts.at(I);
1248     const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
1249     if (Src1V == nullptr) {
1250       if (Verbose) {
1251         Ctx->getStrDump() << "src(1) is not a variable: ";
1252         Insert->dump(Func);
1253         Ctx->getStrDump() << "\n";
1254       }
1255       return false;
1256     }
1257 
1258     const auto *Def = VM->getSingleDefinition(Src1V);
1259     if (Def == nullptr) {
1260       if (Verbose) {
1261         Ctx->getStrDump() << "multi-def src(1): ";
1262         Insert->dump(Func);
1263         Ctx->getStrDump() << "\n";
1264       }
1265       return false;
1266     }
1267 
1268     if (!llvm::isa<InstExtractElement>(Def)) {
1269       if (Verbose) {
1270         Ctx->getStrDump() << "not extractelement: ";
1271         Def->dump(Func);
1272         Ctx->getStrDump() << "\n";
1273       }
1274       return false;
1275     }
1276 
1277     auto *Src = llvm::cast<Variable>(Def->getSrc(0));
1278     if (*Src0 == nullptr) {
1279       // No sources yet. Save Src to Src0.
1280       *Src0 = Src;
1281     } else if (*Src1 == nullptr) {
1282       // We already have a source, so we might save Src in Src1 -- but only if
1283       // Src0 is not Src.
1284       if (*Src0 != Src) {
1285         *Src1 = Src;
1286       }
1287     } else if (Src != *Src0 && Src != *Src1) {
1288       // More than two sources, so we can't rematerialize the shufflevector
1289       // instruction.
1290       if (Verbose) {
1291         Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
1292       }
1293       return false;
1294     }
1295 
1296     (*Extracts)[I] = Def;
1297   }
1298 
1299   // We should have seen at least one source operand.
1300   assert(*Src0 != nullptr);
1301 
1302   // If a second source was not seen, then we just make Src1 = Src0 to simplify
1303   // things down stream. This should not matter, as all of the indexes in the
1304   // shufflevector instruction will point to Src0.
1305   if (*Src1 == nullptr) {
1306     *Src1 = *Src0;
1307   }
1308 
1309   return true;
1310 }
1311 
1312 } // end of namespace ShuffleVectorUtils
1313 } // end of anonymous namespace
1314 
materializeVectorShuffles()1315 void Cfg::materializeVectorShuffles() {
1316   const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);
1317 
1318   std::unique_ptr<OstreamLocker> L;
1319   if (Verbose) {
1320     L.reset(new OstreamLocker(getContext()));
1321     getContext()->getStrDump() << "\nShuffle materialization:\n";
1322   }
1323 
1324   // MaxVectorElements is the maximum number of elements in the vector types
1325   // handled by Subzero. We use it to create the Inserts and Extracts vectors
1326   // with the appropriate size, thus avoiding resize() calls.
1327   const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
1328   CfgVector<const Inst *> Inserts(MaxVectorElements);
1329   CfgVector<const Inst *> Extracts(MaxVectorElements);
1330 
1331   TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
1332   for (CfgNode *Node : Nodes) {
1333     for (auto &Instr : Node->getInsts()) {
1334       if (!llvm::isa<InstInsertElement>(Instr)) {
1335         continue;
1336       }
1337       if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
1338         // To avoid wasting time, we only start the pattern match at the last
1339         // insertelement instruction -- and go backwards from there.
1340         continue;
1341       }
1342       if (Verbose) {
1343         getContext()->getStrDump() << "\tCandidate: ";
1344         Instr.dump(this);
1345         getContext()->getStrDump() << "\n";
1346       }
1347       Inserts.resize(typeNumElements(Instr.getDest()->getType()));
1348       Inserts[0] = &Instr;
1349       if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
1350                                               VMetadata.get(), &Inserts)) {
1351         // If we fail to find a sequence of insertelements, we stop the
1352         // optimization.
1353         if (Verbose) {
1354           getContext()->getStrDump() << "\tFalse alarm.\n";
1355         }
1356         continue;
1357       }
1358       if (Verbose) {
1359         getContext()->getStrDump() << "\tFound the following insertelement: \n";
1360         for (auto *I : reverse_range(Inserts)) {
1361           getContext()->getStrDump() << "\t\t";
1362           I->dump(this);
1363           getContext()->getStrDump() << "\n";
1364         }
1365       }
1366       Extracts.resize(Inserts.size());
1367       Variable *Src0;
1368       Variable *Src1;
1369       if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
1370                                                VMetadata.get(), Inserts, &Src0,
1371                                                &Src1, &Extracts)) {
1372         // If we fail to match the definitions of the insertelements' sources
1373         // with extractelement instructions -- or if those instructions operate
1374         // on more than two different variables -- we stop the optimization.
1375         if (Verbose) {
1376           getContext()->getStrDump() << "\tFailed to match extractelements.\n";
1377         }
1378         continue;
1379       }
1380       if (Verbose) {
1381         getContext()->getStrDump()
1382             << "\tFound the following insert/extract element pairs: \n";
1383         for (SizeT I = 0; I < Inserts.size(); ++I) {
1384           const SizeT Pos = Inserts.size() - I - 1;
1385           getContext()->getStrDump() << "\t\tInsert : ";
1386           Inserts[Pos]->dump(this);
1387           getContext()->getStrDump() << "\n\t\tExtract: ";
1388           Extracts[Pos]->dump(this);
1389           getContext()->getStrDump() << "\n";
1390         }
1391       }
1392 
1393       assert(Src0 != nullptr);
1394       assert(Src1 != nullptr);
1395 
1396       auto *ShuffleVector =
1397           InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
1398       assert(ShuffleVector->getSrc(0) == Src0);
1399       assert(ShuffleVector->getSrc(1) == Src1);
1400       for (SizeT I = 0; I < Extracts.size(); ++I) {
1401         const SizeT Pos = Extracts.size() - I - 1;
1402         auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
1403         if (Src0 == Extracts[Pos]->getSrc(0)) {
1404           ShuffleVector->addIndex(Index);
1405         } else {
1406           ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
1407               Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
1408         }
1409       }
1410 
1411       if (Verbose) {
1412         getContext()->getStrDump() << "Created: ";
1413         ShuffleVector->dump(this);
1414         getContext()->getStrDump() << "\n";
1415       }
1416 
1417       Instr.setDeleted();
1418       auto &LoweringContext = getTarget()->getContext();
1419       LoweringContext.setInsertPoint(instToIterator(&Instr));
1420       LoweringContext.insert(ShuffleVector);
1421     }
1422   }
1423 }
1424 
genCode()1425 void Cfg::genCode() {
1426   TimerMarker T(TimerStack::TT_genCode, this);
1427   for (CfgNode *Node : Nodes)
1428     Node->genCode();
1429 }
1430 
1431 // Compute the stack frame layout.
genFrame()1432 void Cfg::genFrame() {
1433   TimerMarker T(TimerStack::TT_genFrame, this);
1434   getTarget()->addProlog(Entry);
1435   for (CfgNode *Node : Nodes)
1436     if (Node->getHasReturn())
1437       getTarget()->addEpilog(Node);
1438 }
1439 
generateLoopInfo()1440 void Cfg::generateLoopInfo() {
1441   TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
1442   LoopInfo = ComputeLoopInfo(this);
1443 }
1444 
1445 // This is a lightweight version of live-range-end calculation. Marks the last
1446 // use of only those variables whose definition and uses are completely with a
1447 // single block. It is a quick single pass and doesn't need to iterate until
1448 // convergence.
livenessLightweight()1449 void Cfg::livenessLightweight() {
1450   TimerMarker T(TimerStack::TT_livenessLightweight, this);
1451   getVMetadata()->init(VMK_Uses);
1452   for (CfgNode *Node : Nodes)
1453     Node->livenessLightweight();
1454 }
1455 
liveness(LivenessMode Mode)1456 void Cfg::liveness(LivenessMode Mode) {
1457   TimerMarker T(TimerStack::TT_liveness, this);
1458   // Destroying the previous (if any) Liveness information clears the Liveness
1459   // allocator TLS pointer.
1460   Live = nullptr;
1461   Live = Liveness::create(this, Mode);
1462 
1463   getVMetadata()->init(VMK_Uses);
1464   Live->init();
1465 
1466   // Initialize with all nodes needing to be processed.
1467   BitVector NeedToProcess(Nodes.size(), true);
1468   while (NeedToProcess.any()) {
1469     // Iterate in reverse topological order to speed up convergence.
1470     for (CfgNode *Node : reverse_range(Nodes)) {
1471       if (NeedToProcess[Node->getIndex()]) {
1472         NeedToProcess[Node->getIndex()] = false;
1473         bool Changed = Node->liveness(getLiveness());
1474         if (Changed) {
1475           // If the beginning-of-block liveness changed since the last
1476           // iteration, mark all in-edges as needing to be processed.
1477           for (CfgNode *Pred : Node->getInEdges())
1478             NeedToProcess[Pred->getIndex()] = true;
1479         }
1480       }
1481     }
1482   }
1483   if (Mode == Liveness_Intervals) {
1484     // Reset each variable's live range.
1485     for (Variable *Var : Variables)
1486       Var->resetLiveRange();
1487   }
1488   // Make a final pass over each node to delete dead instructions, collect the
1489   // first and last instruction numbers, and add live range segments for that
1490   // node.
1491   for (CfgNode *Node : Nodes) {
1492     InstNumberT FirstInstNum = Inst::NumberSentinel;
1493     InstNumberT LastInstNum = Inst::NumberSentinel;
1494     for (Inst &I : Node->getPhis()) {
1495       I.deleteIfDead();
1496       if (Mode == Liveness_Intervals && !I.isDeleted()) {
1497         if (FirstInstNum == Inst::NumberSentinel)
1498           FirstInstNum = I.getNumber();
1499         assert(I.getNumber() > LastInstNum);
1500         LastInstNum = I.getNumber();
1501       }
1502     }
1503     for (Inst &I : Node->getInsts()) {
1504       I.deleteIfDead();
1505       if (Mode == Liveness_Intervals && !I.isDeleted()) {
1506         if (FirstInstNum == Inst::NumberSentinel)
1507           FirstInstNum = I.getNumber();
1508         assert(I.getNumber() > LastInstNum);
1509         LastInstNum = I.getNumber();
1510       }
1511     }
1512     if (Mode == Liveness_Intervals) {
1513       // Special treatment for live in-args. Their liveness needs to extend
1514       // beyond the beginning of the function, otherwise an arg whose only use
1515       // is in the first instruction will end up having the trivial live range
1516       // [2,2) and will *not* interfere with other arguments. So if the first
1517       // instruction of the method is "r=arg1+arg2", both args may be assigned
1518       // the same register. This is accomplished by extending the entry block's
1519       // instruction range from [2,n) to [1,n) which will transform the
1520       // problematic [2,2) live ranges into [1,2).  This extension works because
1521       // the entry node is guaranteed to have the lowest instruction numbers.
1522       if (Node == getEntryNode()) {
1523         FirstInstNum = Inst::NumberExtended;
1524         // Just in case the entry node somehow contains no instructions...
1525         if (LastInstNum == Inst::NumberSentinel)
1526           LastInstNum = FirstInstNum;
1527       }
1528       // If this node somehow contains no instructions, don't bother trying to
1529       // add liveness intervals for it, because variables that are live-in and
1530       // live-out will have a bogus interval added.
1531       if (FirstInstNum != Inst::NumberSentinel)
1532         Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
1533     }
1534   }
1535 }
1536 
1537 // Traverse every Variable of every Inst and verify that it appears within the
1538 // Variable's computed live range.
validateLiveness() const1539 bool Cfg::validateLiveness() const {
1540   TimerMarker T(TimerStack::TT_validateLiveness, this);
1541   bool Valid = true;
1542   OstreamLocker L(Ctx);
1543   Ostream &Str = Ctx->getStrDump();
1544   for (CfgNode *Node : Nodes) {
1545     Inst *FirstInst = nullptr;
1546     for (Inst &Instr : Node->getInsts()) {
1547       if (Instr.isDeleted())
1548         continue;
1549       if (FirstInst == nullptr)
1550         FirstInst = &Instr;
1551       InstNumberT InstNumber = Instr.getNumber();
1552       if (Variable *Dest = Instr.getDest()) {
1553         if (!Dest->getIgnoreLiveness()) {
1554           bool Invalid = false;
1555           constexpr bool IsDest = true;
1556           if (!Dest->getLiveRange().containsValue(InstNumber, IsDest))
1557             Invalid = true;
1558           // Check that this instruction actually *begins* Dest's live range,
1559           // by checking that Dest is not live in the previous instruction. As
1560           // a special exception, we don't check this for the first instruction
1561           // of the block, because a Phi temporary may be live at the end of
1562           // the previous block, and if it is also assigned in the first
1563           // instruction of this block, the adjacent live ranges get merged.
1564           if (&Instr != FirstInst && !Instr.isDestRedefined() &&
1565               Dest->getLiveRange().containsValue(InstNumber - 1, IsDest))
1566             Invalid = true;
1567           if (Invalid) {
1568             Valid = false;
1569             Str << "Liveness error: inst " << Instr.getNumber() << " dest ";
1570             Dest->dump(this);
1571             Str << " live range " << Dest->getLiveRange() << "\n";
1572           }
1573         }
1574       }
1575       FOREACH_VAR_IN_INST(Var, Instr) {
1576         static constexpr bool IsDest = false;
1577         if (!Var->getIgnoreLiveness() &&
1578             !Var->getLiveRange().containsValue(InstNumber, IsDest)) {
1579           Valid = false;
1580           Str << "Liveness error: inst " << Instr.getNumber() << " var ";
1581           Var->dump(this);
1582           Str << " live range " << Var->getLiveRange() << "\n";
1583         }
1584       }
1585     }
1586   }
1587   return Valid;
1588 }
1589 
contractEmptyNodes()1590 void Cfg::contractEmptyNodes() {
1591   // If we're decorating the asm output with register liveness info, this
1592   // information may become corrupted or incorrect after contracting nodes that
1593   // contain only redundant assignments. As such, we disable this pass when
1594   // DecorateAsm is specified. This may make the resulting code look more
1595   // branchy, but it should have no effect on the register assignments.
1596   if (getFlags().getDecorateAsm())
1597     return;
1598   for (CfgNode *Node : Nodes) {
1599     Node->contractIfEmpty();
1600   }
1601 }
1602 
doBranchOpt()1603 void Cfg::doBranchOpt() {
1604   TimerMarker T(TimerStack::TT_doBranchOpt, this);
1605   for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
1606     auto NextNode = I + 1;
1607     (*I)->doBranchOpt(NextNode == E ? nullptr : *NextNode);
1608   }
1609 }
1610 
markNodesForSandboxing()1611 void Cfg::markNodesForSandboxing() {
1612   for (const InstJumpTable *JT : JumpTables)
1613     for (SizeT I = 0; I < JT->getNumTargets(); ++I)
1614       JT->getTarget(I)->setNeedsAlignment();
1615 }
1616 
1617 // ======================== Dump routines ======================== //
1618 
1619 // emitTextHeader() is not target-specific (apart from what is abstracted by
1620 // the Assembler), so it is defined here rather than in the target lowering
1621 // class.
emitTextHeader(GlobalString Name,GlobalContext * Ctx,const Assembler * Asm)1622 void Cfg::emitTextHeader(GlobalString Name, GlobalContext *Ctx,
1623                          const Assembler *Asm) {
1624   if (!BuildDefs::dump())
1625     return;
1626   Ostream &Str = Ctx->getStrEmit();
1627   Str << "\t.text\n";
1628   if (getFlags().getFunctionSections())
1629     Str << "\t.section\t.text." << Name << ",\"ax\",%progbits\n";
1630   if (!Asm->getInternal() || getFlags().getDisableInternal()) {
1631     Str << "\t.globl\t" << Name << "\n";
1632     Str << "\t.type\t" << Name << ",%function\n";
1633   }
1634   Str << "\t" << Asm->getAlignDirective() << " "
1635       << Asm->getBundleAlignLog2Bytes() << ",0x";
1636   for (uint8_t I : Asm->getNonExecBundlePadding())
1637     Str.write_hex(I);
1638   Str << "\n";
1639   Str << Name << ":\n";
1640 }
1641 
emitJumpTables()1642 void Cfg::emitJumpTables() {
1643   switch (getFlags().getOutFileType()) {
1644   case FT_Elf:
1645   case FT_Iasm: {
1646     // The emission needs to be delayed until the after the text section so
1647     // save the offsets in the global context.
1648     for (const InstJumpTable *JumpTable : JumpTables) {
1649       Ctx->addJumpTableData(JumpTable->toJumpTableData(getAssembler()));
1650     }
1651   } break;
1652   case FT_Asm: {
1653     // Emit the assembly directly so we don't need to hang on to all the names
1654     for (const InstJumpTable *JumpTable : JumpTables)
1655       getTarget()->emitJumpTable(this, JumpTable);
1656   } break;
1657   }
1658 }
1659 
emit()1660 void Cfg::emit() {
1661   if (!BuildDefs::dump())
1662     return;
1663   TimerMarker T(TimerStack::TT_emitAsm, this);
1664   if (getFlags().getDecorateAsm()) {
1665     renumberInstructions();
1666     getVMetadata()->init(VMK_Uses);
1667     liveness(Liveness_Basic);
1668     dump("After recomputing liveness for -decorate-asm");
1669   }
1670   OstreamLocker L(Ctx);
1671   Ostream &Str = Ctx->getStrEmit();
1672   const Assembler *Asm = getAssembler<>();
1673   const bool NeedSandboxing = getFlags().getUseSandboxing();
1674 
1675   emitTextHeader(FunctionName, Ctx, Asm);
1676   if (getFlags().getDecorateAsm()) {
1677     for (Variable *Var : getVariables()) {
1678       if (Var->hasKnownStackOffset() && !Var->isRematerializable()) {
1679         Str << "\t" << Var->getSymbolicStackOffset() << " = "
1680             << Var->getStackOffset() << "\n";
1681       }
1682     }
1683   }
1684   for (CfgNode *Node : Nodes) {
1685     if (NeedSandboxing && Node->needsAlignment()) {
1686       Str << "\t" << Asm->getAlignDirective() << " "
1687           << Asm->getBundleAlignLog2Bytes() << "\n";
1688     }
1689     Node->emit(this);
1690   }
1691   emitJumpTables();
1692   Str << "\n";
1693 }
1694 
emitIAS()1695 void Cfg::emitIAS() {
1696   TimerMarker T(TimerStack::TT_emitAsm, this);
1697   // The emitIAS() routines emit into the internal assembler buffer, so there's
1698   // no need to lock the streams.
1699   const bool NeedSandboxing = getFlags().getUseSandboxing();
1700   for (CfgNode *Node : Nodes) {
1701     if (NeedSandboxing && Node->needsAlignment())
1702       getAssembler()->alignCfgNode();
1703     Node->emitIAS(this);
1704   }
1705   emitJumpTables();
1706 }
1707 
getTotalMemoryMB() const1708 size_t Cfg::getTotalMemoryMB() const {
1709   constexpr size_t _1MB = 1024 * 1024;
1710   assert(Allocator != nullptr);
1711   assert(CfgAllocatorTraits::current() == Allocator.get());
1712   return Allocator->getTotalMemory() / _1MB;
1713 }
1714 
getLivenessMemoryMB() const1715 size_t Cfg::getLivenessMemoryMB() const {
1716   constexpr size_t _1MB = 1024 * 1024;
1717   if (Live == nullptr) {
1718     return 0;
1719   }
1720   return Live->getAllocator()->getTotalMemory() / _1MB;
1721 }
1722 
1723 // Dumps the IR with an optional introductory message.
dump(const char * Message)1724 void Cfg::dump(const char *Message) {
1725   if (!BuildDefs::dump())
1726     return;
1727   if (!isVerbose())
1728     return;
1729   OstreamLocker L(Ctx);
1730   Ostream &Str = Ctx->getStrDump();
1731   if (Message[0])
1732     Str << "================ " << Message << " ================\n";
1733   if (isVerbose(IceV_Mem)) {
1734     Str << "Memory size = " << getTotalMemoryMB() << " MB\n";
1735   }
1736   setCurrentNode(getEntryNode());
1737   // Print function name+args
1738   if (isVerbose(IceV_Instructions)) {
1739     Str << "define ";
1740     if (getInternal() && !getFlags().getDisableInternal())
1741       Str << "internal ";
1742     Str << ReturnType << " @" << getFunctionName() << "(";
1743     for (SizeT i = 0; i < Args.size(); ++i) {
1744       if (i > 0)
1745         Str << ", ";
1746       Str << Args[i]->getType() << " ";
1747       Args[i]->dump(this);
1748     }
1749     // Append an extra copy of the function name here, in order to print its
1750     // size stats but not mess up lit tests.
1751     Str << ") { # " << getFunctionNameAndSize() << "\n";
1752   }
1753   resetCurrentNode();
1754   if (isVerbose(IceV_Liveness)) {
1755     // Print summary info about variables
1756     for (Variable *Var : Variables) {
1757       Str << "// multiblock=";
1758       if (getVMetadata()->isTracked(Var))
1759         Str << getVMetadata()->isMultiBlock(Var);
1760       else
1761         Str << "?";
1762       Str << " defs=";
1763       bool FirstPrint = true;
1764       if (VMetadata->getKind() != VMK_Uses) {
1765         if (const Inst *FirstDef = VMetadata->getFirstDefinition(Var)) {
1766           Str << FirstDef->getNumber();
1767           FirstPrint = false;
1768         }
1769       }
1770       if (VMetadata->getKind() == VMK_All) {
1771         for (const Inst *Instr : VMetadata->getLatterDefinitions(Var)) {
1772           if (!FirstPrint)
1773             Str << ",";
1774           Str << Instr->getNumber();
1775           FirstPrint = false;
1776         }
1777       }
1778       Str << " weight=" << Var->getWeight(this) << " ";
1779       Var->dump(this);
1780       Str << " LIVE=" << Var->getLiveRange() << "\n";
1781     }
1782   }
1783   // Print each basic block
1784   for (CfgNode *Node : Nodes)
1785     Node->dump(this);
1786   if (isVerbose(IceV_Instructions))
1787     Str << "}\n";
1788 }
1789 
1790 } // end of namespace Ice
1791