1 //===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief SI Machine Scheduler interface
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPU.h"
16 #include "SIMachineScheduler.h"
17 #include "llvm/CodeGen/LiveInterval.h"
18 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
19 #include "llvm/CodeGen/MachineRegisterInfo.h"
20 #include "llvm/CodeGen/MachineScheduler.h"
21 #include "llvm/CodeGen/RegisterPressure.h"
22
23 using namespace llvm;
24
25 #define DEBUG_TYPE "misched"
26
27 // This scheduler implements a different scheduling algorithm than
28 // GenericScheduler.
29 //
30 // There are several specific architecture behaviours that can't be modelled
31 // for GenericScheduler:
32 // . When accessing the result of an SGPR load instruction, you have to wait
33 // for all the SGPR load instructions before your current instruction to
34 // have finished.
35 // . When accessing the result of an VGPR load instruction, you have to wait
36 // for all the VGPR load instructions previous to the VGPR load instruction
37 // you are interested in to finish.
38 // . The less the register pressure, the best load latencies are hidden
39 //
40 // Moreover some specifities (like the fact a lot of instructions in the shader
41 // have few dependencies) makes the generic scheduler have some unpredictable
42 // behaviours. For example when register pressure becomes high, it can either
43 // manage to prevent register pressure from going too high, or it can
44 // increase register pressure even more than if it hadn't taken register
45 // pressure into account.
46 //
47 // Also some other bad behaviours are generated, like loading at the beginning
48 // of the shader a constant in VGPR you won't need until the end of the shader.
49 //
50 // The scheduling problem for SI can distinguish three main parts:
51 // . Hiding high latencies (texture sampling, etc)
52 // . Hiding low latencies (SGPR constant loading, etc)
53 // . Keeping register usage low for better latency hiding and general
54 // performance
55 //
56 // Some other things can also affect performance, but are hard to predict
57 // (cache usage, the fact the HW can issue several instructions from different
58 // wavefronts if different types, etc)
59 //
60 // This scheduler tries to solve the scheduling problem by dividing it into
61 // simpler sub-problems. It divides the instructions into blocks, schedules
62 // locally inside the blocks where it takes care of low latencies, and then
63 // chooses the order of the blocks by taking care of high latencies.
64 // Dividing the instructions into blocks helps control keeping register
65 // usage low.
66 //
67 // First the instructions are put into blocks.
68 // We want the blocks help control register usage and hide high latencies
69 // later. To help control register usage, we typically want all local
70 // computations, when for example you create a result that can be comsummed
71 // right away, to be contained in a block. Block inputs and outputs would
72 // typically be important results that are needed in several locations of
73 // the shader. Since we do want blocks to help hide high latencies, we want
74 // the instructions inside the block to have a minimal set of dependencies
75 // on high latencies. It will make it easy to pick blocks to hide specific
76 // high latencies.
77 // The block creation algorithm is divided into several steps, and several
78 // variants can be tried during the scheduling process.
79 //
80 // Second the order of the instructions inside the blocks is choosen.
81 // At that step we do take into account only register usage and hiding
82 // low latency instructions
83 //
84 // Third the block order is choosen, there we try to hide high latencies
85 // and keep register usage low.
86 //
87 // After the third step, a pass is done to improve the hiding of low
88 // latencies.
89 //
90 // Actually when talking about 'low latency' or 'high latency' it includes
91 // both the latency to get the cache (or global mem) data go to the register,
92 // and the bandwith limitations.
93 // Increasing the number of active wavefronts helps hide the former, but it
94 // doesn't solve the latter, thus why even if wavefront count is high, we have
95 // to try have as many instructions hiding high latencies as possible.
96 // The OpenCL doc says for example latency of 400 cycles for a global mem access,
97 // which is hidden by 10 instructions if the wavefront count is 10.
98
99 // Some figures taken from AMD docs:
100 // Both texture and constant L1 caches are 4-way associative with 64 bytes
101 // lines.
102 // Constant cache is shared with 4 CUs.
103 // For texture sampling, the address generation unit receives 4 texture
104 // addresses per cycle, thus we could expect texture sampling latency to be
105 // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
106 // instructions in a wavefront group are executed every 4 cycles),
107 // or 16 instructions if the other wavefronts associated to the 3 other VALUs
108 // of the CU do texture sampling too. (Don't take these figures too seriously,
109 // as I'm not 100% sure of the computation)
110 // Data exports should get similar latency.
111 // For constant loading, the cache is shader with 4 CUs.
112 // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
113 // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
114 // It means a simple s_buffer_load should take one instruction to hide, as
115 // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
116 // cache line.
117 //
118 // As of today the driver doesn't preload the constants in cache, thus the
119 // first loads get extra latency. The doc says global memory access can be
120 // 300-600 cycles. We do not specially take that into account when scheduling
121 // As we expect the driver to be able to preload the constants soon.
122
123
124 // common code //
125
126 #ifndef NDEBUG
127
getReasonStr(SIScheduleCandReason Reason)128 static const char *getReasonStr(SIScheduleCandReason Reason) {
129 switch (Reason) {
130 case NoCand: return "NOCAND";
131 case RegUsage: return "REGUSAGE";
132 case Latency: return "LATENCY";
133 case Successor: return "SUCCESSOR";
134 case Depth: return "DEPTH";
135 case NodeOrder: return "ORDER";
136 }
137 llvm_unreachable("Unknown reason!");
138 }
139
140 #endif
141
tryLess(int TryVal,int CandVal,SISchedulerCandidate & TryCand,SISchedulerCandidate & Cand,SIScheduleCandReason Reason)142 static bool tryLess(int TryVal, int CandVal,
143 SISchedulerCandidate &TryCand,
144 SISchedulerCandidate &Cand,
145 SIScheduleCandReason Reason) {
146 if (TryVal < CandVal) {
147 TryCand.Reason = Reason;
148 return true;
149 }
150 if (TryVal > CandVal) {
151 if (Cand.Reason > Reason)
152 Cand.Reason = Reason;
153 return true;
154 }
155 Cand.setRepeat(Reason);
156 return false;
157 }
158
tryGreater(int TryVal,int CandVal,SISchedulerCandidate & TryCand,SISchedulerCandidate & Cand,SIScheduleCandReason Reason)159 static bool tryGreater(int TryVal, int CandVal,
160 SISchedulerCandidate &TryCand,
161 SISchedulerCandidate &Cand,
162 SIScheduleCandReason Reason) {
163 if (TryVal > CandVal) {
164 TryCand.Reason = Reason;
165 return true;
166 }
167 if (TryVal < CandVal) {
168 if (Cand.Reason > Reason)
169 Cand.Reason = Reason;
170 return true;
171 }
172 Cand.setRepeat(Reason);
173 return false;
174 }
175
176 // SIScheduleBlock //
177
addUnit(SUnit * SU)178 void SIScheduleBlock::addUnit(SUnit *SU) {
179 NodeNum2Index[SU->NodeNum] = SUnits.size();
180 SUnits.push_back(SU);
181 }
182
183 #ifndef NDEBUG
184
traceCandidate(const SISchedCandidate & Cand)185 void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
186
187 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
188 dbgs() << '\n';
189 }
190 #endif
191
tryCandidateTopDown(SISchedCandidate & Cand,SISchedCandidate & TryCand)192 void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
193 SISchedCandidate &TryCand) {
194 // Initialize the candidate if needed.
195 if (!Cand.isValid()) {
196 TryCand.Reason = NodeOrder;
197 return;
198 }
199
200 if (Cand.SGPRUsage > 60 &&
201 tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
202 return;
203
204 // Schedule low latency instructions as top as possible.
205 // Order of priority is:
206 // . Low latency instructions which do not depend on other low latency
207 // instructions we haven't waited for
208 // . Other instructions which do not depend on low latency instructions
209 // we haven't waited for
210 // . Low latencies
211 // . All other instructions
212 // Goal is to get: low latency instructions - independant instructions
213 // - (eventually some more low latency instructions)
214 // - instructions that depend on the first low latency instructions.
215 // If in the block there is a lot of constant loads, the SGPR usage
216 // could go quite high, thus above the arbitrary limit of 60 will encourage
217 // use the already loaded constants (in order to release some SGPRs) before
218 // loading more.
219 if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
220 Cand.HasLowLatencyNonWaitedParent,
221 TryCand, Cand, SIScheduleCandReason::Depth))
222 return;
223
224 if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
225 TryCand, Cand, SIScheduleCandReason::Depth))
226 return;
227
228 if (TryCand.IsLowLatency &&
229 tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
230 TryCand, Cand, SIScheduleCandReason::Depth))
231 return;
232
233 if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
234 return;
235
236 // Fall through to original instruction order.
237 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
238 TryCand.Reason = NodeOrder;
239 }
240 }
241
pickNode()242 SUnit* SIScheduleBlock::pickNode() {
243 SISchedCandidate TopCand;
244
245 for (SUnit* SU : TopReadySUs) {
246 SISchedCandidate TryCand;
247 std::vector<unsigned> pressure;
248 std::vector<unsigned> MaxPressure;
249 // Predict register usage after this instruction.
250 TryCand.SU = SU;
251 TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
252 TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
253 TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
254 TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
255 TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
256 TryCand.HasLowLatencyNonWaitedParent =
257 HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
258 tryCandidateTopDown(TopCand, TryCand);
259 if (TryCand.Reason != NoCand)
260 TopCand.setBest(TryCand);
261 }
262
263 return TopCand.SU;
264 }
265
266
267 // Schedule something valid.
fastSchedule()268 void SIScheduleBlock::fastSchedule() {
269 TopReadySUs.clear();
270 if (Scheduled)
271 undoSchedule();
272
273 for (SUnit* SU : SUnits) {
274 if (!SU->NumPredsLeft)
275 TopReadySUs.push_back(SU);
276 }
277
278 while (!TopReadySUs.empty()) {
279 SUnit *SU = TopReadySUs[0];
280 ScheduledSUnits.push_back(SU);
281 nodeScheduled(SU);
282 }
283
284 Scheduled = true;
285 }
286
287 // Returns if the register was set between first and last.
isDefBetween(unsigned Reg,SlotIndex First,SlotIndex Last,const MachineRegisterInfo * MRI,const LiveIntervals * LIS)288 static bool isDefBetween(unsigned Reg,
289 SlotIndex First, SlotIndex Last,
290 const MachineRegisterInfo *MRI,
291 const LiveIntervals *LIS) {
292 for (MachineRegisterInfo::def_instr_iterator
293 UI = MRI->def_instr_begin(Reg),
294 UE = MRI->def_instr_end(); UI != UE; ++UI) {
295 const MachineInstr* MI = &*UI;
296 if (MI->isDebugValue())
297 continue;
298 SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
299 if (InstSlot >= First && InstSlot <= Last)
300 return true;
301 }
302 return false;
303 }
304
initRegPressure(MachineBasicBlock::iterator BeginBlock,MachineBasicBlock::iterator EndBlock)305 void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
306 MachineBasicBlock::iterator EndBlock) {
307 IntervalPressure Pressure, BotPressure;
308 RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
309 LiveIntervals *LIS = DAG->getLIS();
310 MachineRegisterInfo *MRI = DAG->getMRI();
311 DAG->initRPTracker(TopRPTracker);
312 DAG->initRPTracker(BotRPTracker);
313 DAG->initRPTracker(RPTracker);
314
315 // Goes though all SU. RPTracker captures what had to be alive for the SUs
316 // to execute, and what is still alive at the end.
317 for (SUnit* SU : ScheduledSUnits) {
318 RPTracker.setPos(SU->getInstr());
319 RPTracker.advance();
320 }
321
322 // Close the RPTracker to finalize live ins/outs.
323 RPTracker.closeRegion();
324
325 // Initialize the live ins and live outs.
326 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
327 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
328
329 // Do not Track Physical Registers, because it messes up.
330 for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
331 if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
332 LiveInRegs.insert(RegMaskPair.RegUnit);
333 }
334 LiveOutRegs.clear();
335 // There is several possibilities to distinguish:
336 // 1) Reg is not input to any instruction in the block, but is output of one
337 // 2) 1) + read in the block and not needed after it
338 // 3) 1) + read in the block but needed in another block
339 // 4) Reg is input of an instruction but another block will read it too
340 // 5) Reg is input of an instruction and then rewritten in the block.
341 // result is not read in the block (implies used in another block)
342 // 6) Reg is input of an instruction and then rewritten in the block.
343 // result is read in the block and not needed in another block
344 // 7) Reg is input of an instruction and then rewritten in the block.
345 // result is read in the block but also needed in another block
346 // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
347 // We want LiveOutRegs to contain only Regs whose content will be read after
348 // in another block, and whose content was written in the current block,
349 // that is we want it to get 1, 3, 5, 7
350 // Since we made the MIs of a block to be packed all together before
351 // scheduling, then the LiveIntervals were correct, and the RPTracker was
352 // able to correctly handle 5 vs 6, 2 vs 3.
353 // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
354 // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
355 // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
356 // The use of findDefBetween removes the case 4.
357 for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
358 unsigned Reg = RegMaskPair.RegUnit;
359 if (TargetRegisterInfo::isVirtualRegister(Reg) &&
360 isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
361 LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
362 LIS)) {
363 LiveOutRegs.insert(Reg);
364 }
365 }
366
367 // Pressure = sum_alive_registers register size
368 // Internally llvm will represent some registers as big 128 bits registers
369 // for example, but they actually correspond to 4 actual 32 bits registers.
370 // Thus Pressure is not equal to num_alive_registers * constant.
371 LiveInPressure = TopPressure.MaxSetPressure;
372 LiveOutPressure = BotPressure.MaxSetPressure;
373
374 // Prepares TopRPTracker for top down scheduling.
375 TopRPTracker.closeTop();
376 }
377
schedule(MachineBasicBlock::iterator BeginBlock,MachineBasicBlock::iterator EndBlock)378 void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
379 MachineBasicBlock::iterator EndBlock) {
380 if (!Scheduled)
381 fastSchedule();
382
383 // PreScheduling phase to set LiveIn and LiveOut.
384 initRegPressure(BeginBlock, EndBlock);
385 undoSchedule();
386
387 // Schedule for real now.
388
389 TopReadySUs.clear();
390
391 for (SUnit* SU : SUnits) {
392 if (!SU->NumPredsLeft)
393 TopReadySUs.push_back(SU);
394 }
395
396 while (!TopReadySUs.empty()) {
397 SUnit *SU = pickNode();
398 ScheduledSUnits.push_back(SU);
399 TopRPTracker.setPos(SU->getInstr());
400 TopRPTracker.advance();
401 nodeScheduled(SU);
402 }
403
404 // TODO: compute InternalAdditionnalPressure.
405 InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
406
407 // Check everything is right.
408 #ifndef NDEBUG
409 assert(SUnits.size() == ScheduledSUnits.size() &&
410 TopReadySUs.empty());
411 for (SUnit* SU : SUnits) {
412 assert(SU->isScheduled &&
413 SU->NumPredsLeft == 0);
414 }
415 #endif
416
417 Scheduled = true;
418 }
419
undoSchedule()420 void SIScheduleBlock::undoSchedule() {
421 for (SUnit* SU : SUnits) {
422 SU->isScheduled = false;
423 for (SDep& Succ : SU->Succs) {
424 if (BC->isSUInBlock(Succ.getSUnit(), ID))
425 undoReleaseSucc(SU, &Succ);
426 }
427 }
428 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
429 ScheduledSUnits.clear();
430 Scheduled = false;
431 }
432
undoReleaseSucc(SUnit * SU,SDep * SuccEdge)433 void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
434 SUnit *SuccSU = SuccEdge->getSUnit();
435
436 if (SuccEdge->isWeak()) {
437 ++SuccSU->WeakPredsLeft;
438 return;
439 }
440 ++SuccSU->NumPredsLeft;
441 }
442
releaseSucc(SUnit * SU,SDep * SuccEdge)443 void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
444 SUnit *SuccSU = SuccEdge->getSUnit();
445
446 if (SuccEdge->isWeak()) {
447 --SuccSU->WeakPredsLeft;
448 return;
449 }
450 #ifndef NDEBUG
451 if (SuccSU->NumPredsLeft == 0) {
452 dbgs() << "*** Scheduling failed! ***\n";
453 SuccSU->dump(DAG);
454 dbgs() << " has been released too many times!\n";
455 llvm_unreachable(nullptr);
456 }
457 #endif
458
459 --SuccSU->NumPredsLeft;
460 }
461
462 /// Release Successors of the SU that are in the block or not.
releaseSuccessors(SUnit * SU,bool InOrOutBlock)463 void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
464 for (SDep& Succ : SU->Succs) {
465 SUnit *SuccSU = Succ.getSUnit();
466
467 if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
468 continue;
469
470 releaseSucc(SU, &Succ);
471 if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
472 TopReadySUs.push_back(SuccSU);
473 }
474 }
475
nodeScheduled(SUnit * SU)476 void SIScheduleBlock::nodeScheduled(SUnit *SU) {
477 // Is in TopReadySUs
478 assert (!SU->NumPredsLeft);
479 std::vector<SUnit*>::iterator I =
480 std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
481 if (I == TopReadySUs.end()) {
482 dbgs() << "Data Structure Bug in SI Scheduler\n";
483 llvm_unreachable(nullptr);
484 }
485 TopReadySUs.erase(I);
486
487 releaseSuccessors(SU, true);
488 // Scheduling this node will trigger a wait,
489 // thus propagate to other instructions that they do not need to wait either.
490 if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
491 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
492
493 if (DAG->IsLowLatencySU[SU->NodeNum]) {
494 for (SDep& Succ : SU->Succs) {
495 std::map<unsigned, unsigned>::iterator I =
496 NodeNum2Index.find(Succ.getSUnit()->NodeNum);
497 if (I != NodeNum2Index.end())
498 HasLowLatencyNonWaitedParent[I->second] = 1;
499 }
500 }
501 SU->isScheduled = true;
502 }
503
finalizeUnits()504 void SIScheduleBlock::finalizeUnits() {
505 // We remove links from outside blocks to enable scheduling inside the block.
506 for (SUnit* SU : SUnits) {
507 releaseSuccessors(SU, false);
508 if (DAG->IsHighLatencySU[SU->NodeNum])
509 HighLatencyBlock = true;
510 }
511 HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
512 }
513
514 // we maintain ascending order of IDs
addPred(SIScheduleBlock * Pred)515 void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
516 unsigned PredID = Pred->getID();
517
518 // Check if not already predecessor.
519 for (SIScheduleBlock* P : Preds) {
520 if (PredID == P->getID())
521 return;
522 }
523 Preds.push_back(Pred);
524
525 assert(none_of(Succs,
526 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
527 "Loop in the Block Graph!");
528 }
529
addSucc(SIScheduleBlock * Succ)530 void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
531 unsigned SuccID = Succ->getID();
532
533 // Check if not already predecessor.
534 for (SIScheduleBlock* S : Succs) {
535 if (SuccID == S->getID())
536 return;
537 }
538 if (Succ->isHighLatencyBlock())
539 ++NumHighLatencySuccessors;
540 Succs.push_back(Succ);
541 assert(none_of(Preds,
542 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
543 "Loop in the Block Graph!");
544 }
545
546 #ifndef NDEBUG
printDebug(bool full)547 void SIScheduleBlock::printDebug(bool full) {
548 dbgs() << "Block (" << ID << ")\n";
549 if (!full)
550 return;
551
552 dbgs() << "\nContains High Latency Instruction: "
553 << HighLatencyBlock << '\n';
554 dbgs() << "\nDepends On:\n";
555 for (SIScheduleBlock* P : Preds) {
556 P->printDebug(false);
557 }
558
559 dbgs() << "\nSuccessors:\n";
560 for (SIScheduleBlock* S : Succs) {
561 S->printDebug(false);
562 }
563
564 if (Scheduled) {
565 dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
566 << LiveInPressure[DAG->getVGPRSetID()] << '\n';
567 dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
568 << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
569 dbgs() << "LiveIns:\n";
570 for (unsigned Reg : LiveInRegs)
571 dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
572
573 dbgs() << "\nLiveOuts:\n";
574 for (unsigned Reg : LiveOutRegs)
575 dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
576 }
577
578 dbgs() << "\nInstructions:\n";
579 if (!Scheduled) {
580 for (SUnit* SU : SUnits) {
581 SU->dump(DAG);
582 }
583 } else {
584 for (SUnit* SU : SUnits) {
585 SU->dump(DAG);
586 }
587 }
588
589 dbgs() << "///////////////////////\n";
590 }
591
592 #endif
593
594 // SIScheduleBlockCreator //
595
SIScheduleBlockCreator(SIScheduleDAGMI * DAG)596 SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
597 DAG(DAG) {
598 }
599
~SIScheduleBlockCreator()600 SIScheduleBlockCreator::~SIScheduleBlockCreator() {
601 }
602
603 SIScheduleBlocks
getBlocks(SISchedulerBlockCreatorVariant BlockVariant)604 SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
605 std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
606 Blocks.find(BlockVariant);
607 if (B == Blocks.end()) {
608 SIScheduleBlocks Res;
609 createBlocksForVariant(BlockVariant);
610 topologicalSort();
611 scheduleInsideBlocks();
612 fillStats();
613 Res.Blocks = CurrentBlocks;
614 Res.TopDownIndex2Block = TopDownIndex2Block;
615 Res.TopDownBlock2Index = TopDownBlock2Index;
616 Blocks[BlockVariant] = Res;
617 return Res;
618 } else {
619 return B->second;
620 }
621 }
622
isSUInBlock(SUnit * SU,unsigned ID)623 bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
624 if (SU->NodeNum >= DAG->SUnits.size())
625 return false;
626 return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
627 }
628
colorHighLatenciesAlone()629 void SIScheduleBlockCreator::colorHighLatenciesAlone() {
630 unsigned DAGSize = DAG->SUnits.size();
631
632 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
633 SUnit *SU = &DAG->SUnits[i];
634 if (DAG->IsHighLatencySU[SU->NodeNum]) {
635 CurrentColoring[SU->NodeNum] = NextReservedID++;
636 }
637 }
638 }
639
colorHighLatenciesGroups()640 void SIScheduleBlockCreator::colorHighLatenciesGroups() {
641 unsigned DAGSize = DAG->SUnits.size();
642 unsigned NumHighLatencies = 0;
643 unsigned GroupSize;
644 unsigned Color = NextReservedID;
645 unsigned Count = 0;
646 std::set<unsigned> FormingGroup;
647
648 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
649 SUnit *SU = &DAG->SUnits[i];
650 if (DAG->IsHighLatencySU[SU->NodeNum])
651 ++NumHighLatencies;
652 }
653
654 if (NumHighLatencies == 0)
655 return;
656
657 if (NumHighLatencies <= 6)
658 GroupSize = 2;
659 else if (NumHighLatencies <= 12)
660 GroupSize = 3;
661 else
662 GroupSize = 4;
663
664 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
665 SUnit *SU = &DAG->SUnits[i];
666 if (DAG->IsHighLatencySU[SU->NodeNum]) {
667 unsigned CompatibleGroup = true;
668 unsigned ProposedColor = Color;
669 for (unsigned j : FormingGroup) {
670 // TODO: Currently CompatibleGroup will always be false,
671 // because the graph enforces the load order. This
672 // can be fixed, but as keeping the load order is often
673 // good for performance that causes a performance hit (both
674 // the default scheduler and this scheduler).
675 // When this scheduler determines a good load order,
676 // this can be fixed.
677 if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
678 !DAG->canAddEdge(&DAG->SUnits[j], SU))
679 CompatibleGroup = false;
680 }
681 if (!CompatibleGroup || ++Count == GroupSize) {
682 FormingGroup.clear();
683 Color = ++NextReservedID;
684 if (!CompatibleGroup) {
685 ProposedColor = Color;
686 FormingGroup.insert(SU->NodeNum);
687 }
688 Count = 0;
689 } else {
690 FormingGroup.insert(SU->NodeNum);
691 }
692 CurrentColoring[SU->NodeNum] = ProposedColor;
693 }
694 }
695 }
696
colorComputeReservedDependencies()697 void SIScheduleBlockCreator::colorComputeReservedDependencies() {
698 unsigned DAGSize = DAG->SUnits.size();
699 std::map<std::set<unsigned>, unsigned> ColorCombinations;
700
701 CurrentTopDownReservedDependencyColoring.clear();
702 CurrentBottomUpReservedDependencyColoring.clear();
703
704 CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
705 CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
706
707 // Traverse TopDown, and give different colors to SUs depending
708 // on which combination of High Latencies they depend on.
709
710 for (unsigned SUNum : DAG->TopDownIndex2SU) {
711 SUnit *SU = &DAG->SUnits[SUNum];
712 std::set<unsigned> SUColors;
713
714 // Already given.
715 if (CurrentColoring[SU->NodeNum]) {
716 CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
717 CurrentColoring[SU->NodeNum];
718 continue;
719 }
720
721 for (SDep& PredDep : SU->Preds) {
722 SUnit *Pred = PredDep.getSUnit();
723 if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
724 continue;
725 if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
726 SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
727 }
728 // Color 0 by default.
729 if (SUColors.empty())
730 continue;
731 // Same color than parents.
732 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
733 CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
734 *SUColors.begin();
735 else {
736 std::map<std::set<unsigned>, unsigned>::iterator Pos =
737 ColorCombinations.find(SUColors);
738 if (Pos != ColorCombinations.end()) {
739 CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
740 } else {
741 CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
742 NextNonReservedID;
743 ColorCombinations[SUColors] = NextNonReservedID++;
744 }
745 }
746 }
747
748 ColorCombinations.clear();
749
750 // Same as before, but BottomUp.
751
752 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
753 SUnit *SU = &DAG->SUnits[SUNum];
754 std::set<unsigned> SUColors;
755
756 // Already given.
757 if (CurrentColoring[SU->NodeNum]) {
758 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
759 CurrentColoring[SU->NodeNum];
760 continue;
761 }
762
763 for (SDep& SuccDep : SU->Succs) {
764 SUnit *Succ = SuccDep.getSUnit();
765 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
766 continue;
767 if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
768 SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
769 }
770 // Keep color 0.
771 if (SUColors.empty())
772 continue;
773 // Same color than parents.
774 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
775 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
776 *SUColors.begin();
777 else {
778 std::map<std::set<unsigned>, unsigned>::iterator Pos =
779 ColorCombinations.find(SUColors);
780 if (Pos != ColorCombinations.end()) {
781 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
782 } else {
783 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
784 NextNonReservedID;
785 ColorCombinations[SUColors] = NextNonReservedID++;
786 }
787 }
788 }
789 }
790
colorAccordingToReservedDependencies()791 void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
792 unsigned DAGSize = DAG->SUnits.size();
793 std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
794
795 // Every combination of colors given by the top down
796 // and bottom up Reserved node dependency
797
798 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
799 SUnit *SU = &DAG->SUnits[i];
800 std::pair<unsigned, unsigned> SUColors;
801
802 // High latency instructions: already given.
803 if (CurrentColoring[SU->NodeNum])
804 continue;
805
806 SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
807 SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
808
809 std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
810 ColorCombinations.find(SUColors);
811 if (Pos != ColorCombinations.end()) {
812 CurrentColoring[SU->NodeNum] = Pos->second;
813 } else {
814 CurrentColoring[SU->NodeNum] = NextNonReservedID;
815 ColorCombinations[SUColors] = NextNonReservedID++;
816 }
817 }
818 }
819
colorEndsAccordingToDependencies()820 void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
821 unsigned DAGSize = DAG->SUnits.size();
822 std::vector<int> PendingColoring = CurrentColoring;
823
824 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
825 SUnit *SU = &DAG->SUnits[SUNum];
826 std::set<unsigned> SUColors;
827 std::set<unsigned> SUColorsPending;
828
829 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
830 continue;
831
832 if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
833 CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
834 continue;
835
836 for (SDep& SuccDep : SU->Succs) {
837 SUnit *Succ = SuccDep.getSUnit();
838 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
839 continue;
840 if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
841 CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
842 SUColors.insert(CurrentColoring[Succ->NodeNum]);
843 SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
844 }
845 if (SUColors.size() == 1 && SUColorsPending.size() == 1)
846 PendingColoring[SU->NodeNum] = *SUColors.begin();
847 else // TODO: Attribute new colors depending on color
848 // combination of children.
849 PendingColoring[SU->NodeNum] = NextNonReservedID++;
850 }
851 CurrentColoring = PendingColoring;
852 }
853
854
colorForceConsecutiveOrderInGroup()855 void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
856 unsigned DAGSize = DAG->SUnits.size();
857 unsigned PreviousColor;
858 std::set<unsigned> SeenColors;
859
860 if (DAGSize <= 1)
861 return;
862
863 PreviousColor = CurrentColoring[0];
864
865 for (unsigned i = 1, e = DAGSize; i != e; ++i) {
866 SUnit *SU = &DAG->SUnits[i];
867 unsigned CurrentColor = CurrentColoring[i];
868 unsigned PreviousColorSave = PreviousColor;
869 assert(i == SU->NodeNum);
870
871 if (CurrentColor != PreviousColor)
872 SeenColors.insert(PreviousColor);
873 PreviousColor = CurrentColor;
874
875 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
876 continue;
877
878 if (SeenColors.find(CurrentColor) == SeenColors.end())
879 continue;
880
881 if (PreviousColorSave != CurrentColor)
882 CurrentColoring[i] = NextNonReservedID++;
883 else
884 CurrentColoring[i] = CurrentColoring[i-1];
885 }
886 }
887
colorMergeConstantLoadsNextGroup()888 void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
889 unsigned DAGSize = DAG->SUnits.size();
890
891 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
892 SUnit *SU = &DAG->SUnits[SUNum];
893 std::set<unsigned> SUColors;
894
895 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
896 continue;
897
898 // No predecessor: Vgpr constant loading.
899 // Low latency instructions usually have a predecessor (the address)
900 if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
901 continue;
902
903 for (SDep& SuccDep : SU->Succs) {
904 SUnit *Succ = SuccDep.getSUnit();
905 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
906 continue;
907 SUColors.insert(CurrentColoring[Succ->NodeNum]);
908 }
909 if (SUColors.size() == 1)
910 CurrentColoring[SU->NodeNum] = *SUColors.begin();
911 }
912 }
913
colorMergeIfPossibleNextGroup()914 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
915 unsigned DAGSize = DAG->SUnits.size();
916
917 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
918 SUnit *SU = &DAG->SUnits[SUNum];
919 std::set<unsigned> SUColors;
920
921 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
922 continue;
923
924 for (SDep& SuccDep : SU->Succs) {
925 SUnit *Succ = SuccDep.getSUnit();
926 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
927 continue;
928 SUColors.insert(CurrentColoring[Succ->NodeNum]);
929 }
930 if (SUColors.size() == 1)
931 CurrentColoring[SU->NodeNum] = *SUColors.begin();
932 }
933 }
934
colorMergeIfPossibleNextGroupOnlyForReserved()935 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
936 unsigned DAGSize = DAG->SUnits.size();
937
938 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
939 SUnit *SU = &DAG->SUnits[SUNum];
940 std::set<unsigned> SUColors;
941
942 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
943 continue;
944
945 for (SDep& SuccDep : SU->Succs) {
946 SUnit *Succ = SuccDep.getSUnit();
947 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
948 continue;
949 SUColors.insert(CurrentColoring[Succ->NodeNum]);
950 }
951 if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
952 CurrentColoring[SU->NodeNum] = *SUColors.begin();
953 }
954 }
955
colorMergeIfPossibleSmallGroupsToNextGroup()956 void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
957 unsigned DAGSize = DAG->SUnits.size();
958 std::map<unsigned, unsigned> ColorCount;
959
960 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
961 SUnit *SU = &DAG->SUnits[SUNum];
962 unsigned color = CurrentColoring[SU->NodeNum];
963 std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
964 if (Pos != ColorCount.end()) {
965 ++ColorCount[color];
966 } else {
967 ColorCount[color] = 1;
968 }
969 }
970
971 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
972 SUnit *SU = &DAG->SUnits[SUNum];
973 unsigned color = CurrentColoring[SU->NodeNum];
974 std::set<unsigned> SUColors;
975
976 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
977 continue;
978
979 if (ColorCount[color] > 1)
980 continue;
981
982 for (SDep& SuccDep : SU->Succs) {
983 SUnit *Succ = SuccDep.getSUnit();
984 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
985 continue;
986 SUColors.insert(CurrentColoring[Succ->NodeNum]);
987 }
988 if (SUColors.size() == 1 && *SUColors.begin() != color) {
989 --ColorCount[color];
990 CurrentColoring[SU->NodeNum] = *SUColors.begin();
991 ++ColorCount[*SUColors.begin()];
992 }
993 }
994 }
995
cutHugeBlocks()996 void SIScheduleBlockCreator::cutHugeBlocks() {
997 // TODO
998 }
999
regroupNoUserInstructions()1000 void SIScheduleBlockCreator::regroupNoUserInstructions() {
1001 unsigned DAGSize = DAG->SUnits.size();
1002 int GroupID = NextNonReservedID++;
1003
1004 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1005 SUnit *SU = &DAG->SUnits[SUNum];
1006 bool hasSuccessor = false;
1007
1008 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1009 continue;
1010
1011 for (SDep& SuccDep : SU->Succs) {
1012 SUnit *Succ = SuccDep.getSUnit();
1013 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1014 continue;
1015 hasSuccessor = true;
1016 }
1017 if (!hasSuccessor)
1018 CurrentColoring[SU->NodeNum] = GroupID;
1019 }
1020 }
1021
createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant)1022 void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
1023 unsigned DAGSize = DAG->SUnits.size();
1024 std::map<unsigned,unsigned> RealID;
1025
1026 CurrentBlocks.clear();
1027 CurrentColoring.clear();
1028 CurrentColoring.resize(DAGSize, 0);
1029 Node2CurrentBlock.clear();
1030
1031 // Restore links previous scheduling variant has overridden.
1032 DAG->restoreSULinksLeft();
1033
1034 NextReservedID = 1;
1035 NextNonReservedID = DAGSize + 1;
1036
1037 DEBUG(dbgs() << "Coloring the graph\n");
1038
1039 if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
1040 colorHighLatenciesGroups();
1041 else
1042 colorHighLatenciesAlone();
1043 colorComputeReservedDependencies();
1044 colorAccordingToReservedDependencies();
1045 colorEndsAccordingToDependencies();
1046 if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
1047 colorForceConsecutiveOrderInGroup();
1048 regroupNoUserInstructions();
1049 colorMergeConstantLoadsNextGroup();
1050 colorMergeIfPossibleNextGroupOnlyForReserved();
1051
1052 // Put SUs of same color into same block
1053 Node2CurrentBlock.resize(DAGSize, -1);
1054 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1055 SUnit *SU = &DAG->SUnits[i];
1056 unsigned Color = CurrentColoring[SU->NodeNum];
1057 if (RealID.find(Color) == RealID.end()) {
1058 int ID = CurrentBlocks.size();
1059 BlockPtrs.push_back(
1060 make_unique<SIScheduleBlock>(DAG, this, ID));
1061 CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
1062 RealID[Color] = ID;
1063 }
1064 CurrentBlocks[RealID[Color]]->addUnit(SU);
1065 Node2CurrentBlock[SU->NodeNum] = RealID[Color];
1066 }
1067
1068 // Build dependencies between blocks.
1069 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1070 SUnit *SU = &DAG->SUnits[i];
1071 int SUID = Node2CurrentBlock[i];
1072 for (SDep& SuccDep : SU->Succs) {
1073 SUnit *Succ = SuccDep.getSUnit();
1074 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1075 continue;
1076 if (Node2CurrentBlock[Succ->NodeNum] != SUID)
1077 CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
1078 }
1079 for (SDep& PredDep : SU->Preds) {
1080 SUnit *Pred = PredDep.getSUnit();
1081 if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
1082 continue;
1083 if (Node2CurrentBlock[Pred->NodeNum] != SUID)
1084 CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
1085 }
1086 }
1087
1088 // Free root and leafs of all blocks to enable scheduling inside them.
1089 for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1090 SIScheduleBlock *Block = CurrentBlocks[i];
1091 Block->finalizeUnits();
1092 }
1093 DEBUG(
1094 dbgs() << "Blocks created:\n\n";
1095 for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1096 SIScheduleBlock *Block = CurrentBlocks[i];
1097 Block->printDebug(true);
1098 }
1099 );
1100 }
1101
1102 // Two functions taken from Codegen/MachineScheduler.cpp
1103
1104 /// If this iterator is a debug value, increment until reaching the End or a
1105 /// non-debug instruction.
1106 static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,MachineBasicBlock::const_iterator End)1107 nextIfDebug(MachineBasicBlock::const_iterator I,
1108 MachineBasicBlock::const_iterator End) {
1109 for(; I != End; ++I) {
1110 if (!I->isDebugValue())
1111 break;
1112 }
1113 return I;
1114 }
1115
1116 /// Non-const version.
1117 static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,MachineBasicBlock::const_iterator End)1118 nextIfDebug(MachineBasicBlock::iterator I,
1119 MachineBasicBlock::const_iterator End) {
1120 // Cast the return value to nonconst MachineInstr, then cast to an
1121 // instr_iterator, which does not check for null, finally return a
1122 // bundle_iterator.
1123 return MachineBasicBlock::instr_iterator(
1124 const_cast<MachineInstr*>(
1125 &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
1126 }
1127
topologicalSort()1128 void SIScheduleBlockCreator::topologicalSort() {
1129 unsigned DAGSize = CurrentBlocks.size();
1130 std::vector<int> WorkList;
1131
1132 DEBUG(dbgs() << "Topological Sort\n");
1133
1134 WorkList.reserve(DAGSize);
1135 TopDownIndex2Block.resize(DAGSize);
1136 TopDownBlock2Index.resize(DAGSize);
1137 BottomUpIndex2Block.resize(DAGSize);
1138
1139 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1140 SIScheduleBlock *Block = CurrentBlocks[i];
1141 unsigned Degree = Block->getSuccs().size();
1142 TopDownBlock2Index[i] = Degree;
1143 if (Degree == 0) {
1144 WorkList.push_back(i);
1145 }
1146 }
1147
1148 int Id = DAGSize;
1149 while (!WorkList.empty()) {
1150 int i = WorkList.back();
1151 SIScheduleBlock *Block = CurrentBlocks[i];
1152 WorkList.pop_back();
1153 TopDownBlock2Index[i] = --Id;
1154 TopDownIndex2Block[Id] = i;
1155 for (SIScheduleBlock* Pred : Block->getPreds()) {
1156 if (!--TopDownBlock2Index[Pred->getID()])
1157 WorkList.push_back(Pred->getID());
1158 }
1159 }
1160
1161 #ifndef NDEBUG
1162 // Check correctness of the ordering.
1163 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1164 SIScheduleBlock *Block = CurrentBlocks[i];
1165 for (SIScheduleBlock* Pred : Block->getPreds()) {
1166 assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
1167 "Wrong Top Down topological sorting");
1168 }
1169 }
1170 #endif
1171
1172 BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
1173 TopDownIndex2Block.rend());
1174 }
1175
scheduleInsideBlocks()1176 void SIScheduleBlockCreator::scheduleInsideBlocks() {
1177 unsigned DAGSize = CurrentBlocks.size();
1178
1179 DEBUG(dbgs() << "\nScheduling Blocks\n\n");
1180
1181 // We do schedule a valid scheduling such that a Block corresponds
1182 // to a range of instructions.
1183 DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
1184 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1185 SIScheduleBlock *Block = CurrentBlocks[i];
1186 Block->fastSchedule();
1187 }
1188
1189 // Note: the following code, and the part restoring previous position
1190 // is by far the most expensive operation of the Scheduler.
1191
1192 // Do not update CurrentTop.
1193 MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
1194 std::vector<MachineBasicBlock::iterator> PosOld;
1195 std::vector<MachineBasicBlock::iterator> PosNew;
1196 PosOld.reserve(DAG->SUnits.size());
1197 PosNew.reserve(DAG->SUnits.size());
1198
1199 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1200 int BlockIndice = TopDownIndex2Block[i];
1201 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1202 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1203
1204 for (SUnit* SU : SUs) {
1205 MachineInstr *MI = SU->getInstr();
1206 MachineBasicBlock::iterator Pos = MI;
1207 PosOld.push_back(Pos);
1208 if (&*CurrentTopFastSched == MI) {
1209 PosNew.push_back(Pos);
1210 CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
1211 DAG->getCurrentBottom());
1212 } else {
1213 // Update the instruction stream.
1214 DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
1215
1216 // Update LiveIntervals.
1217 // Note: Moving all instructions and calling handleMove everytime
1218 // is the most cpu intensive operation of the scheduler.
1219 // It would gain a lot if there was a way to recompute the
1220 // LiveIntervals for the entire scheduling region.
1221 DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
1222 PosNew.push_back(CurrentTopFastSched);
1223 }
1224 }
1225 }
1226
1227 // Now we have Block of SUs == Block of MI.
1228 // We do the final schedule for the instructions inside the block.
1229 // The property that all the SUs of the Block are grouped together as MI
1230 // is used for correct reg usage tracking.
1231 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1232 SIScheduleBlock *Block = CurrentBlocks[i];
1233 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1234 Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
1235 }
1236
1237 DEBUG(dbgs() << "Restoring MI Pos\n");
1238 // Restore old ordering (which prevents a LIS->handleMove bug).
1239 for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
1240 MachineBasicBlock::iterator POld = PosOld[i-1];
1241 MachineBasicBlock::iterator PNew = PosNew[i-1];
1242 if (PNew != POld) {
1243 // Update the instruction stream.
1244 DAG->getBB()->splice(POld, DAG->getBB(), PNew);
1245
1246 // Update LiveIntervals.
1247 DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
1248 }
1249 }
1250
1251 DEBUG(
1252 for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1253 SIScheduleBlock *Block = CurrentBlocks[i];
1254 Block->printDebug(true);
1255 }
1256 );
1257 }
1258
fillStats()1259 void SIScheduleBlockCreator::fillStats() {
1260 unsigned DAGSize = CurrentBlocks.size();
1261
1262 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1263 int BlockIndice = TopDownIndex2Block[i];
1264 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1265 if (Block->getPreds().size() == 0)
1266 Block->Depth = 0;
1267 else {
1268 unsigned Depth = 0;
1269 for (SIScheduleBlock *Pred : Block->getPreds()) {
1270 if (Depth < Pred->Depth + 1)
1271 Depth = Pred->Depth + 1;
1272 }
1273 Block->Depth = Depth;
1274 }
1275 }
1276
1277 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1278 int BlockIndice = BottomUpIndex2Block[i];
1279 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1280 if (Block->getSuccs().size() == 0)
1281 Block->Height = 0;
1282 else {
1283 unsigned Height = 0;
1284 for (SIScheduleBlock *Succ : Block->getSuccs()) {
1285 if (Height < Succ->Height + 1)
1286 Height = Succ->Height + 1;
1287 }
1288 Block->Height = Height;
1289 }
1290 }
1291 }
1292
1293 // SIScheduleBlockScheduler //
1294
SIScheduleBlockScheduler(SIScheduleDAGMI * DAG,SISchedulerBlockSchedulerVariant Variant,SIScheduleBlocks BlocksStruct)1295 SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
1296 SISchedulerBlockSchedulerVariant Variant,
1297 SIScheduleBlocks BlocksStruct) :
1298 DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
1299 LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1300 SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1301
1302 // Fill the usage of every output
1303 // Warning: while by construction we always have a link between two blocks
1304 // when one needs a result from the other, the number of users of an output
1305 // is not the sum of child blocks having as input the same virtual register.
1306 // Here is an example. A produces x and y. B eats x and produces x'.
1307 // C eats x' and y. The register coalescer may have attributed the same
1308 // virtual register to x and x'.
1309 // To count accurately, we do a topological sort. In case the register is
1310 // found for several parents, we increment the usage of the one with the
1311 // highest topological index.
1312 LiveOutRegsNumUsages.resize(Blocks.size());
1313 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1314 SIScheduleBlock *Block = Blocks[i];
1315 for (unsigned Reg : Block->getInRegs()) {
1316 bool Found = false;
1317 int topoInd = -1;
1318 for (SIScheduleBlock* Pred: Block->getPreds()) {
1319 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1320 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1321
1322 if (RegPos != PredOutRegs.end()) {
1323 Found = true;
1324 if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
1325 topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
1326 }
1327 }
1328 }
1329
1330 if (!Found)
1331 continue;
1332
1333 int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
1334 std::map<unsigned, unsigned>::iterator RegPos =
1335 LiveOutRegsNumUsages[PredID].find(Reg);
1336 if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
1337 ++LiveOutRegsNumUsages[PredID][Reg];
1338 } else {
1339 LiveOutRegsNumUsages[PredID][Reg] = 1;
1340 }
1341 }
1342 }
1343
1344 LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
1345 BlockNumPredsLeft.resize(Blocks.size());
1346 BlockNumSuccsLeft.resize(Blocks.size());
1347
1348 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1349 SIScheduleBlock *Block = Blocks[i];
1350 BlockNumPredsLeft[i] = Block->getPreds().size();
1351 BlockNumSuccsLeft[i] = Block->getSuccs().size();
1352 }
1353
1354 #ifndef NDEBUG
1355 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1356 SIScheduleBlock *Block = Blocks[i];
1357 assert(Block->getID() == i);
1358 }
1359 #endif
1360
1361 std::set<unsigned> InRegs = DAG->getInRegs();
1362 addLiveRegs(InRegs);
1363
1364 // Fill LiveRegsConsumers for regs that were already
1365 // defined before scheduling.
1366 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1367 SIScheduleBlock *Block = Blocks[i];
1368 for (unsigned Reg : Block->getInRegs()) {
1369 bool Found = false;
1370 for (SIScheduleBlock* Pred: Block->getPreds()) {
1371 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1372 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1373
1374 if (RegPos != PredOutRegs.end()) {
1375 Found = true;
1376 break;
1377 }
1378 }
1379
1380 if (!Found) {
1381 if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
1382 LiveRegsConsumers[Reg] = 1;
1383 else
1384 ++LiveRegsConsumers[Reg];
1385 }
1386 }
1387 }
1388
1389 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1390 SIScheduleBlock *Block = Blocks[i];
1391 if (BlockNumPredsLeft[i] == 0) {
1392 ReadyBlocks.push_back(Block);
1393 }
1394 }
1395
1396 while (SIScheduleBlock *Block = pickBlock()) {
1397 BlocksScheduled.push_back(Block);
1398 blockScheduled(Block);
1399 }
1400
1401 DEBUG(
1402 dbgs() << "Block Order:";
1403 for (SIScheduleBlock* Block : BlocksScheduled) {
1404 dbgs() << ' ' << Block->getID();
1405 }
1406 );
1407 }
1408
tryCandidateLatency(SIBlockSchedCandidate & Cand,SIBlockSchedCandidate & TryCand)1409 bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
1410 SIBlockSchedCandidate &TryCand) {
1411 if (!Cand.isValid()) {
1412 TryCand.Reason = NodeOrder;
1413 return true;
1414 }
1415
1416 // Try to hide high latencies.
1417 if (tryLess(TryCand.LastPosHighLatParentScheduled,
1418 Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
1419 return true;
1420 // Schedule high latencies early so you can hide them better.
1421 if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
1422 TryCand, Cand, Latency))
1423 return true;
1424 if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
1425 TryCand, Cand, Depth))
1426 return true;
1427 if (tryGreater(TryCand.NumHighLatencySuccessors,
1428 Cand.NumHighLatencySuccessors,
1429 TryCand, Cand, Successor))
1430 return true;
1431 return false;
1432 }
1433
tryCandidateRegUsage(SIBlockSchedCandidate & Cand,SIBlockSchedCandidate & TryCand)1434 bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1435 SIBlockSchedCandidate &TryCand) {
1436 if (!Cand.isValid()) {
1437 TryCand.Reason = NodeOrder;
1438 return true;
1439 }
1440
1441 if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
1442 TryCand, Cand, RegUsage))
1443 return true;
1444 if (tryGreater(TryCand.NumSuccessors > 0,
1445 Cand.NumSuccessors > 0,
1446 TryCand, Cand, Successor))
1447 return true;
1448 if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1449 return true;
1450 if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
1451 TryCand, Cand, RegUsage))
1452 return true;
1453 return false;
1454 }
1455
pickBlock()1456 SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1457 SIBlockSchedCandidate Cand;
1458 std::vector<SIScheduleBlock*>::iterator Best;
1459 SIScheduleBlock *Block;
1460 if (ReadyBlocks.empty())
1461 return nullptr;
1462
1463 DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1464 VregCurrentUsage, SregCurrentUsage);
1465 if (VregCurrentUsage > maxVregUsage)
1466 maxVregUsage = VregCurrentUsage;
1467 if (VregCurrentUsage > maxSregUsage)
1468 maxSregUsage = VregCurrentUsage;
1469 DEBUG(
1470 dbgs() << "Picking New Blocks\n";
1471 dbgs() << "Available: ";
1472 for (SIScheduleBlock* Block : ReadyBlocks)
1473 dbgs() << Block->getID() << ' ';
1474 dbgs() << "\nCurrent Live:\n";
1475 for (unsigned Reg : LiveRegs)
1476 dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
1477 dbgs() << '\n';
1478 dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
1479 dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
1480 );
1481
1482 Cand.Block = nullptr;
1483 for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1484 E = ReadyBlocks.end(); I != E; ++I) {
1485 SIBlockSchedCandidate TryCand;
1486 TryCand.Block = *I;
1487 TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1488 TryCand.VGPRUsageDiff =
1489 checkRegUsageImpact(TryCand.Block->getInRegs(),
1490 TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
1491 TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1492 TryCand.NumHighLatencySuccessors =
1493 TryCand.Block->getNumHighLatencySuccessors();
1494 TryCand.LastPosHighLatParentScheduled =
1495 (unsigned int) std::max<int> (0,
1496 LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1497 LastPosWaitedHighLatency);
1498 TryCand.Height = TryCand.Block->Height;
1499 // Try not to increase VGPR usage too much, else we may spill.
1500 if (VregCurrentUsage > 120 ||
1501 Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
1502 if (!tryCandidateRegUsage(Cand, TryCand) &&
1503 Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
1504 tryCandidateLatency(Cand, TryCand);
1505 } else {
1506 if (!tryCandidateLatency(Cand, TryCand))
1507 tryCandidateRegUsage(Cand, TryCand);
1508 }
1509 if (TryCand.Reason != NoCand) {
1510 Cand.setBest(TryCand);
1511 Best = I;
1512 DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
1513 << getReasonStr(Cand.Reason) << '\n');
1514 }
1515 }
1516
1517 DEBUG(
1518 dbgs() << "Picking: " << Cand.Block->getID() << '\n';
1519 dbgs() << "Is a block with high latency instruction: "
1520 << (Cand.IsHighLatency ? "yes\n" : "no\n");
1521 dbgs() << "Position of last high latency dependency: "
1522 << Cand.LastPosHighLatParentScheduled << '\n';
1523 dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
1524 dbgs() << '\n';
1525 );
1526
1527 Block = Cand.Block;
1528 ReadyBlocks.erase(Best);
1529 return Block;
1530 }
1531
1532 // Tracking of currently alive registers to determine VGPR Usage.
1533
addLiveRegs(std::set<unsigned> & Regs)1534 void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1535 for (unsigned Reg : Regs) {
1536 // For now only track virtual registers.
1537 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1538 continue;
1539 // If not already in the live set, then add it.
1540 (void) LiveRegs.insert(Reg);
1541 }
1542 }
1543
decreaseLiveRegs(SIScheduleBlock * Block,std::set<unsigned> & Regs)1544 void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1545 std::set<unsigned> &Regs) {
1546 for (unsigned Reg : Regs) {
1547 // For now only track virtual registers.
1548 std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1549 assert (Pos != LiveRegs.end() && // Reg must be live.
1550 LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1551 LiveRegsConsumers[Reg] >= 1);
1552 --LiveRegsConsumers[Reg];
1553 if (LiveRegsConsumers[Reg] == 0)
1554 LiveRegs.erase(Pos);
1555 }
1556 }
1557
releaseBlockSuccs(SIScheduleBlock * Parent)1558 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1559 for (SIScheduleBlock* Block : Parent->getSuccs()) {
1560 --BlockNumPredsLeft[Block->getID()];
1561 if (BlockNumPredsLeft[Block->getID()] == 0) {
1562 ReadyBlocks.push_back(Block);
1563 }
1564 // TODO: Improve check. When the dependency between the high latency
1565 // instructions and the instructions of the other blocks are WAR or WAW
1566 // there will be no wait triggered. We would like these cases to not
1567 // update LastPosHighLatencyParentScheduled.
1568 if (Parent->isHighLatencyBlock())
1569 LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
1570 }
1571 }
1572
blockScheduled(SIScheduleBlock * Block)1573 void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1574 decreaseLiveRegs(Block, Block->getInRegs());
1575 addLiveRegs(Block->getOutRegs());
1576 releaseBlockSuccs(Block);
1577 for (std::map<unsigned, unsigned>::iterator RegI =
1578 LiveOutRegsNumUsages[Block->getID()].begin(),
1579 E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
1580 std::pair<unsigned, unsigned> RegP = *RegI;
1581 if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
1582 LiveRegsConsumers[RegP.first] = RegP.second;
1583 else {
1584 assert(LiveRegsConsumers[RegP.first] == 0);
1585 LiveRegsConsumers[RegP.first] += RegP.second;
1586 }
1587 }
1588 if (LastPosHighLatencyParentScheduled[Block->getID()] >
1589 (unsigned)LastPosWaitedHighLatency)
1590 LastPosWaitedHighLatency =
1591 LastPosHighLatencyParentScheduled[Block->getID()];
1592 ++NumBlockScheduled;
1593 }
1594
1595 std::vector<int>
checkRegUsageImpact(std::set<unsigned> & InRegs,std::set<unsigned> & OutRegs)1596 SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1597 std::set<unsigned> &OutRegs) {
1598 std::vector<int> DiffSetPressure;
1599 DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1600
1601 for (unsigned Reg : InRegs) {
1602 // For now only track virtual registers.
1603 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1604 continue;
1605 if (LiveRegsConsumers[Reg] > 1)
1606 continue;
1607 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1608 for (; PSetI.isValid(); ++PSetI) {
1609 DiffSetPressure[*PSetI] -= PSetI.getWeight();
1610 }
1611 }
1612
1613 for (unsigned Reg : OutRegs) {
1614 // For now only track virtual registers.
1615 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1616 continue;
1617 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1618 for (; PSetI.isValid(); ++PSetI) {
1619 DiffSetPressure[*PSetI] += PSetI.getWeight();
1620 }
1621 }
1622
1623 return DiffSetPressure;
1624 }
1625
1626 // SIScheduler //
1627
1628 struct SIScheduleBlockResult
scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,SISchedulerBlockSchedulerVariant ScheduleVariant)1629 SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1630 SISchedulerBlockSchedulerVariant ScheduleVariant) {
1631 SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1632 SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1633 std::vector<SIScheduleBlock*> ScheduledBlocks;
1634 struct SIScheduleBlockResult Res;
1635
1636 ScheduledBlocks = Scheduler.getBlocks();
1637
1638 for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
1639 SIScheduleBlock *Block = ScheduledBlocks[b];
1640 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1641
1642 for (SUnit* SU : SUs)
1643 Res.SUs.push_back(SU->NodeNum);
1644 }
1645
1646 Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1647 Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1648 return Res;
1649 }
1650
1651 // SIScheduleDAGMI //
1652
SIScheduleDAGMI(MachineSchedContext * C)1653 SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
1654 ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
1655 SITII = static_cast<const SIInstrInfo*>(TII);
1656 SITRI = static_cast<const SIRegisterInfo*>(TRI);
1657
1658 VGPRSetID = SITRI->getVGPR32PressureSet();
1659 SGPRSetID = SITRI->getSGPR32PressureSet();
1660 }
1661
~SIScheduleDAGMI()1662 SIScheduleDAGMI::~SIScheduleDAGMI() {
1663 }
1664
createSIMachineScheduler(MachineSchedContext * C)1665 ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
1666 return new SIScheduleDAGMI(C);
1667 }
1668
1669 // Code adapted from scheduleDAG.cpp
1670 // Does a topological sort over the SUs.
1671 // Both TopDown and BottomUp
topologicalSort()1672 void SIScheduleDAGMI::topologicalSort() {
1673 Topo.InitDAGTopologicalSorting();
1674
1675 TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1676 BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1677 }
1678
1679 // Move low latencies further from their user without
1680 // increasing SGPR usage (in general)
1681 // This is to be replaced by a better pass that would
1682 // take into account SGPR usage (based on VGPR Usage
1683 // and the corresponding wavefront count), that would
1684 // try to merge groups of loads if it make sense, etc
moveLowLatencies()1685 void SIScheduleDAGMI::moveLowLatencies() {
1686 unsigned DAGSize = SUnits.size();
1687 int LastLowLatencyUser = -1;
1688 int LastLowLatencyPos = -1;
1689
1690 for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1691 SUnit *SU = &SUnits[ScheduledSUnits[i]];
1692 bool IsLowLatencyUser = false;
1693 unsigned MinPos = 0;
1694
1695 for (SDep& PredDep : SU->Preds) {
1696 SUnit *Pred = PredDep.getSUnit();
1697 if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1698 IsLowLatencyUser = true;
1699 }
1700 if (Pred->NodeNum >= DAGSize)
1701 continue;
1702 unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
1703 if (PredPos >= MinPos)
1704 MinPos = PredPos + 1;
1705 }
1706
1707 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1708 unsigned BestPos = LastLowLatencyUser + 1;
1709 if ((int)BestPos <= LastLowLatencyPos)
1710 BestPos = LastLowLatencyPos + 1;
1711 if (BestPos < MinPos)
1712 BestPos = MinPos;
1713 if (BestPos < i) {
1714 for (unsigned u = i; u > BestPos; --u) {
1715 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1716 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1717 }
1718 ScheduledSUnits[BestPos] = SU->NodeNum;
1719 ScheduledSUnitsInv[SU->NodeNum] = BestPos;
1720 }
1721 LastLowLatencyPos = BestPos;
1722 if (IsLowLatencyUser)
1723 LastLowLatencyUser = BestPos;
1724 } else if (IsLowLatencyUser) {
1725 LastLowLatencyUser = i;
1726 // Moves COPY instructions on which depends
1727 // the low latency instructions too.
1728 } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
1729 bool CopyForLowLat = false;
1730 for (SDep& SuccDep : SU->Succs) {
1731 SUnit *Succ = SuccDep.getSUnit();
1732 if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1733 CopyForLowLat = true;
1734 }
1735 }
1736 if (!CopyForLowLat)
1737 continue;
1738 if (MinPos < i) {
1739 for (unsigned u = i; u > MinPos; --u) {
1740 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1741 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1742 }
1743 ScheduledSUnits[MinPos] = SU->NodeNum;
1744 ScheduledSUnitsInv[SU->NodeNum] = MinPos;
1745 }
1746 }
1747 }
1748 }
1749
restoreSULinksLeft()1750 void SIScheduleDAGMI::restoreSULinksLeft() {
1751 for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1752 SUnits[i].isScheduled = false;
1753 SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1754 SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1755 SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1756 SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1757 }
1758 }
1759
1760 // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1761 template<typename _Iterator> void
fillVgprSgprCost(_Iterator First,_Iterator End,unsigned & VgprUsage,unsigned & SgprUsage)1762 SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1763 unsigned &VgprUsage, unsigned &SgprUsage) {
1764 VgprUsage = 0;
1765 SgprUsage = 0;
1766 for (_Iterator RegI = First; RegI != End; ++RegI) {
1767 unsigned Reg = *RegI;
1768 // For now only track virtual registers
1769 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1770 continue;
1771 PSetIterator PSetI = MRI.getPressureSets(Reg);
1772 for (; PSetI.isValid(); ++PSetI) {
1773 if (*PSetI == VGPRSetID)
1774 VgprUsage += PSetI.getWeight();
1775 else if (*PSetI == SGPRSetID)
1776 SgprUsage += PSetI.getWeight();
1777 }
1778 }
1779 }
1780
schedule()1781 void SIScheduleDAGMI::schedule()
1782 {
1783 SmallVector<SUnit*, 8> TopRoots, BotRoots;
1784 SIScheduleBlockResult Best, Temp;
1785 DEBUG(dbgs() << "Preparing Scheduling\n");
1786
1787 buildDAGWithRegPressure();
1788 DEBUG(
1789 for(SUnit& SU : SUnits)
1790 SU.dumpAll(this)
1791 );
1792
1793 topologicalSort();
1794 findRootsAndBiasEdges(TopRoots, BotRoots);
1795 // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1796 // functions, but to make them happy we must initialize
1797 // the default Scheduler implementation (even if we do not
1798 // run it)
1799 SchedImpl->initialize(this);
1800 initQueues(TopRoots, BotRoots);
1801
1802 // Fill some stats to help scheduling.
1803
1804 SUnitsLinksBackup = SUnits;
1805 IsLowLatencySU.clear();
1806 LowLatencyOffset.clear();
1807 IsHighLatencySU.clear();
1808
1809 IsLowLatencySU.resize(SUnits.size(), 0);
1810 LowLatencyOffset.resize(SUnits.size(), 0);
1811 IsHighLatencySU.resize(SUnits.size(), 0);
1812
1813 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1814 SUnit *SU = &SUnits[i];
1815 unsigned BaseLatReg;
1816 int64_t OffLatReg;
1817 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1818 IsLowLatencySU[i] = 1;
1819 if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
1820 TRI))
1821 LowLatencyOffset[i] = OffLatReg;
1822 } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
1823 IsHighLatencySU[i] = 1;
1824 }
1825
1826 SIScheduler Scheduler(this);
1827 Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
1828 SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
1829
1830 // if VGPR usage is extremely high, try other good performing variants
1831 // which could lead to lower VGPR usage
1832 if (Best.MaxVGPRUsage > 180) {
1833 std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
1834 { LatenciesAlone, BlockRegUsageLatency },
1835 // { LatenciesAlone, BlockRegUsage },
1836 { LatenciesGrouped, BlockLatencyRegUsage },
1837 // { LatenciesGrouped, BlockRegUsageLatency },
1838 // { LatenciesGrouped, BlockRegUsage },
1839 { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1840 // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1841 // { LatenciesAlonePlusConsecutive, BlockRegUsage }
1842 };
1843 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1844 Temp = Scheduler.scheduleVariant(v.first, v.second);
1845 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1846 Best = Temp;
1847 }
1848 }
1849 // if VGPR usage is still extremely high, we may spill. Try other variants
1850 // which are less performing, but that could lead to lower VGPR usage.
1851 if (Best.MaxVGPRUsage > 200) {
1852 std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
1853 // { LatenciesAlone, BlockRegUsageLatency },
1854 { LatenciesAlone, BlockRegUsage },
1855 // { LatenciesGrouped, BlockLatencyRegUsage },
1856 { LatenciesGrouped, BlockRegUsageLatency },
1857 { LatenciesGrouped, BlockRegUsage },
1858 // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1859 { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1860 { LatenciesAlonePlusConsecutive, BlockRegUsage }
1861 };
1862 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1863 Temp = Scheduler.scheduleVariant(v.first, v.second);
1864 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1865 Best = Temp;
1866 }
1867 }
1868
1869 ScheduledSUnits = Best.SUs;
1870 ScheduledSUnitsInv.resize(SUnits.size());
1871
1872 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1873 ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
1874 }
1875
1876 moveLowLatencies();
1877
1878 // Tell the outside world about the result of the scheduling.
1879
1880 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1881 TopRPTracker.setPos(CurrentTop);
1882
1883 for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
1884 E = ScheduledSUnits.end(); I != E; ++I) {
1885 SUnit *SU = &SUnits[*I];
1886
1887 scheduleMI(SU, true);
1888
1889 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
1890 << *SU->getInstr());
1891 }
1892
1893 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1894
1895 placeDebugValues();
1896
1897 DEBUG({
1898 unsigned BBNum = begin()->getParent()->getNumber();
1899 dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
1900 dumpSchedule();
1901 dbgs() << '\n';
1902 });
1903 }
1904