1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// SI Machine Scheduler interface
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
16 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
17 
18 #include "SIInstrInfo.h"
19 #include "llvm/CodeGen/MachineBasicBlock.h"
20 #include "llvm/CodeGen/MachineScheduler.h"
21 #include "llvm/CodeGen/RegisterPressure.h"
22 #include "llvm/CodeGen/ScheduleDAG.h"
23 #include <cassert>
24 #include <cstdint>
25 #include <map>
26 #include <memory>
27 #include <set>
28 #include <vector>
29 
30 namespace llvm {
31 
32 enum SIScheduleCandReason {
33   NoCand,
34   RegUsage,
35   Latency,
36   Successor,
37   Depth,
38   NodeOrder
39 };
40 
41 struct SISchedulerCandidate {
42   // The reason for this candidate.
43   SIScheduleCandReason Reason = NoCand;
44 
45   // Set of reasons that apply to multiple candidates.
46   uint32_t RepeatReasonSet = 0;
47 
48   SISchedulerCandidate() = default;
49 
isRepeatSISchedulerCandidate50   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
setRepeatSISchedulerCandidate51   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
52 };
53 
54 class SIScheduleDAGMI;
55 class SIScheduleBlockCreator;
56 
57 enum SIScheduleBlockLinkKind {
58   NoData,
59   Data
60 };
61 
62 class SIScheduleBlock {
63   SIScheduleDAGMI *DAG;
64   SIScheduleBlockCreator *BC;
65 
66   std::vector<SUnit*> SUnits;
67   std::map<unsigned, unsigned> NodeNum2Index;
68   std::vector<SUnit*> TopReadySUs;
69   std::vector<SUnit*> ScheduledSUnits;
70 
71   /// The top of the unscheduled zone.
72   IntervalPressure TopPressure;
73   RegPressureTracker TopRPTracker;
74 
75   // Pressure: number of said class of registers needed to
76   // store the live virtual and real registers.
77   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
78   // Pressure of additional registers required inside the block.
79   std::vector<unsigned> InternalAdditionnalPressure;
80   // Pressure of input and output registers
81   std::vector<unsigned> LiveInPressure;
82   std::vector<unsigned> LiveOutPressure;
83   // Registers required by the block, and outputs.
84   // We do track only virtual registers.
85   // Note that some registers are not 32 bits,
86   // and thus the pressure is not equal
87   // to the number of live registers.
88   std::set<unsigned> LiveInRegs;
89   std::set<unsigned> LiveOutRegs;
90 
91   bool Scheduled = false;
92   bool HighLatencyBlock = false;
93 
94   std::vector<unsigned> HasLowLatencyNonWaitedParent;
95 
96   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
97   unsigned ID;
98 
99   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
100   // All blocks successors, and the kind of link
101   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
102   unsigned NumHighLatencySuccessors = 0;
103 
104 public:
SIScheduleBlock(SIScheduleDAGMI * DAG,SIScheduleBlockCreator * BC,unsigned ID)105   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
106                   unsigned ID):
107     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
108 
109   ~SIScheduleBlock() = default;
110 
getID()111   unsigned getID() const { return ID; }
112 
113   /// Functions for Block construction.
114   void addUnit(SUnit *SU);
115 
116   // When all SUs have been added.
117   void finalizeUnits();
118 
119   // Add block pred, which has instruction predecessor of SU.
120   void addPred(SIScheduleBlock *Pred);
121   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
122 
getPreds()123   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
124   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
getSuccs()125     getSuccs() const { return Succs; }
126 
127   unsigned Height;  // Maximum topdown path length to block without outputs
128   unsigned Depth;   // Maximum bottomup path length to block without inputs
129 
getNumHighLatencySuccessors()130   unsigned getNumHighLatencySuccessors() const {
131     return NumHighLatencySuccessors;
132   }
133 
isHighLatencyBlock()134   bool isHighLatencyBlock() { return HighLatencyBlock; }
135 
136   // This is approximative.
137   // Ideally should take into accounts some instructions (rcp, etc)
138   // are 4 times slower.
getCost()139   int getCost() { return SUnits.size(); }
140 
141   // The block Predecessors and Successors must be all registered
142   // before fastSchedule().
143   // Fast schedule with no particular requirement.
144   void fastSchedule();
145 
getScheduledUnits()146   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
147 
148   // Complete schedule that will try to minimize reg pressure and
149   // low latencies, and will fill liveins and liveouts.
150   // Needs all MIs to be grouped between BeginBlock and EndBlock.
151   // The MIs can be moved after the scheduling,
152   // it is just used to allow correct track of live registers.
153   void schedule(MachineBasicBlock::iterator BeginBlock,
154                 MachineBasicBlock::iterator EndBlock);
155 
isScheduled()156   bool isScheduled() { return Scheduled; }
157 
158   // Needs the block to be scheduled inside
159   // TODO: find a way to compute it.
getInternalAdditionnalRegUsage()160   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
161     return InternalAdditionnalPressure;
162   }
163 
getInRegs()164   std::set<unsigned> &getInRegs() { return LiveInRegs; }
getOutRegs()165   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
166 
167   void printDebug(bool Full);
168 
169 private:
170   struct SISchedCandidate : SISchedulerCandidate {
171     // The best SUnit candidate.
172     SUnit *SU = nullptr;
173 
174     unsigned SGPRUsage;
175     unsigned VGPRUsage;
176     bool IsLowLatency;
177     unsigned LowLatencyOffset;
178     bool HasLowLatencyNonWaitedParent;
179 
180     SISchedCandidate() = default;
181 
isValidSISchedCandidate182     bool isValid() const { return SU; }
183 
184     // Copy the status of another candidate without changing policy.
setBestSISchedCandidate185     void setBest(SISchedCandidate &Best) {
186       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
187       SU = Best.SU;
188       Reason = Best.Reason;
189       SGPRUsage = Best.SGPRUsage;
190       VGPRUsage = Best.VGPRUsage;
191       IsLowLatency = Best.IsLowLatency;
192       LowLatencyOffset = Best.LowLatencyOffset;
193       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
194     }
195   };
196 
197   void undoSchedule();
198 
199   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
200   void releaseSucc(SUnit *SU, SDep *SuccEdge);
201   // InOrOutBlock: restrict to links pointing inside the block (true),
202   // or restrict to links pointing outside the block (false).
203   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
204 
205   void nodeScheduled(SUnit *SU);
206   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
207   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
208   SUnit* pickNode();
209   void traceCandidate(const SISchedCandidate &Cand);
210   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
211                        MachineBasicBlock::iterator EndBlock);
212 };
213 
214 struct SIScheduleBlocks {
215   std::vector<SIScheduleBlock*> Blocks;
216   std::vector<int> TopDownIndex2Block;
217   std::vector<int> TopDownBlock2Index;
218 };
219 
220 enum SISchedulerBlockCreatorVariant {
221   LatenciesAlone,
222   LatenciesGrouped,
223   LatenciesAlonePlusConsecutive
224 };
225 
226 class SIScheduleBlockCreator {
227   SIScheduleDAGMI *DAG;
228   // unique_ptr handles freeing memory for us.
229   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
230   std::map<SISchedulerBlockCreatorVariant,
231            SIScheduleBlocks> Blocks;
232   std::vector<SIScheduleBlock*> CurrentBlocks;
233   std::vector<int> Node2CurrentBlock;
234 
235   // Topological sort
236   // Maps topological index to the node number.
237   std::vector<int> TopDownIndex2Block;
238   std::vector<int> TopDownBlock2Index;
239   std::vector<int> BottomUpIndex2Block;
240 
241   // 0 -> Color not given.
242   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
243   // Above -> Other groups.
244   int NextReservedID;
245   int NextNonReservedID;
246   std::vector<int> CurrentColoring;
247   std::vector<int> CurrentTopDownReservedDependencyColoring;
248   std::vector<int> CurrentBottomUpReservedDependencyColoring;
249 
250 public:
251   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
252   ~SIScheduleBlockCreator();
253 
254   SIScheduleBlocks
255   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
256 
257   bool isSUInBlock(SUnit *SU, unsigned ID);
258 
259 private:
260   // Give a Reserved color to every high latency.
261   void colorHighLatenciesAlone();
262 
263   // Create groups of high latencies with a Reserved color.
264   void colorHighLatenciesGroups();
265 
266   // Compute coloring for topdown and bottom traversals with
267   // different colors depending on dependencies on Reserved colors.
268   void colorComputeReservedDependencies();
269 
270   // Give color to all non-colored SUs according to Reserved groups dependencies.
271   void colorAccordingToReservedDependencies();
272 
273   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
274   // The new colors are computed according to the dependencies on the other blocks
275   // formed with colorAccordingToReservedDependencies.
276   void colorEndsAccordingToDependencies();
277 
278   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
279   void colorForceConsecutiveOrderInGroup();
280 
281   // Merge Constant loads that have all their users into another group to the group.
282   // (TODO: else if all their users depend on the same group, put them there)
283   void colorMergeConstantLoadsNextGroup();
284 
285   // Merge SUs that have all their users into another group to the group
286   void colorMergeIfPossibleNextGroup();
287 
288   // Merge SUs that have all their users into another group to the group,
289   // but only for Reserved groups.
290   void colorMergeIfPossibleNextGroupOnlyForReserved();
291 
292   // Merge SUs that have all their users into another group to the group,
293   // but only if the group is no more than a few SUs.
294   void colorMergeIfPossibleSmallGroupsToNextGroup();
295 
296   // Divides Blocks with important size.
297   // Idea of implementation: attribute new colors depending on topdown and
298   // bottom up links to other blocks.
299   void cutHugeBlocks();
300 
301   // Put in one group all instructions with no users in this scheduling region
302   // (we'd want these groups be at the end).
303   void regroupNoUserInstructions();
304 
305   // Give Reserved color to export instructions
306   void colorExports();
307 
308   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
309 
310   void topologicalSort();
311 
312   void scheduleInsideBlocks();
313 
314   void fillStats();
315 };
316 
317 enum SISchedulerBlockSchedulerVariant {
318   BlockLatencyRegUsage,
319   BlockRegUsageLatency,
320   BlockRegUsage
321 };
322 
323 class SIScheduleBlockScheduler {
324   SIScheduleDAGMI *DAG;
325   SISchedulerBlockSchedulerVariant Variant;
326   std::vector<SIScheduleBlock*> Blocks;
327 
328   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
329   std::set<unsigned> LiveRegs;
330   // Num of schedulable unscheduled blocks reading the register.
331   std::map<unsigned, unsigned> LiveRegsConsumers;
332 
333   std::vector<unsigned> LastPosHighLatencyParentScheduled;
334   int LastPosWaitedHighLatency;
335 
336   std::vector<SIScheduleBlock*> BlocksScheduled;
337   unsigned NumBlockScheduled;
338   std::vector<SIScheduleBlock*> ReadyBlocks;
339 
340   unsigned VregCurrentUsage;
341   unsigned SregCurrentUsage;
342 
343   // Currently is only approximation.
344   unsigned maxVregUsage;
345   unsigned maxSregUsage;
346 
347   std::vector<unsigned> BlockNumPredsLeft;
348   std::vector<unsigned> BlockNumSuccsLeft;
349 
350 public:
351   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
352                            SISchedulerBlockSchedulerVariant Variant,
353                            SIScheduleBlocks BlocksStruct);
354   ~SIScheduleBlockScheduler() = default;
355 
getBlocks()356   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
357 
getVGPRUsage()358   unsigned getVGPRUsage() { return maxVregUsage; }
getSGPRUsage()359   unsigned getSGPRUsage() { return maxSregUsage; }
360 
361 private:
362   struct SIBlockSchedCandidate : SISchedulerCandidate {
363     // The best Block candidate.
364     SIScheduleBlock *Block = nullptr;
365 
366     bool IsHighLatency;
367     int VGPRUsageDiff;
368     unsigned NumSuccessors;
369     unsigned NumHighLatencySuccessors;
370     unsigned LastPosHighLatParentScheduled;
371     unsigned Height;
372 
373     SIBlockSchedCandidate() = default;
374 
isValidSIBlockSchedCandidate375     bool isValid() const { return Block; }
376 
377     // Copy the status of another candidate without changing policy.
setBestSIBlockSchedCandidate378     void setBest(SIBlockSchedCandidate &Best) {
379       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
380       Block = Best.Block;
381       Reason = Best.Reason;
382       IsHighLatency = Best.IsHighLatency;
383       VGPRUsageDiff = Best.VGPRUsageDiff;
384       NumSuccessors = Best.NumSuccessors;
385       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
386       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
387       Height = Best.Height;
388     }
389   };
390 
391   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
392                            SIBlockSchedCandidate &TryCand);
393   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
394                             SIBlockSchedCandidate &TryCand);
395   SIScheduleBlock *pickBlock();
396 
397   void addLiveRegs(std::set<unsigned> &Regs);
398   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
399   void releaseBlockSuccs(SIScheduleBlock *Parent);
400   void blockScheduled(SIScheduleBlock *Block);
401 
402   // Check register pressure change
403   // by scheduling a block with these LiveIn and LiveOut.
404   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
405                                        std::set<unsigned> &OutRegs);
406 
407   void schedule();
408 };
409 
410 struct SIScheduleBlockResult {
411   std::vector<unsigned> SUs;
412   unsigned MaxSGPRUsage;
413   unsigned MaxVGPRUsage;
414 };
415 
416 class SIScheduler {
417   SIScheduleDAGMI *DAG;
418   SIScheduleBlockCreator BlockCreator;
419 
420 public:
SIScheduler(SIScheduleDAGMI * DAG)421   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
422 
423   ~SIScheduler() = default;
424 
425   struct SIScheduleBlockResult
426   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
427                   SISchedulerBlockSchedulerVariant ScheduleVariant);
428 };
429 
430 class SIScheduleDAGMI final : public ScheduleDAGMILive {
431   const SIInstrInfo *SITII;
432   const SIRegisterInfo *SITRI;
433 
434   std::vector<SUnit> SUnitsLinksBackup;
435 
436   // For moveLowLatencies. After all Scheduling variants are tested.
437   std::vector<unsigned> ScheduledSUnits;
438   std::vector<unsigned> ScheduledSUnitsInv;
439 
440   unsigned VGPRSetID;
441   unsigned SGPRSetID;
442 
443 public:
444   SIScheduleDAGMI(MachineSchedContext *C);
445 
446   ~SIScheduleDAGMI() override;
447 
448   // Entry point for the schedule.
449   void schedule() override;
450 
451   // To init Block's RPTracker.
initRPTracker(RegPressureTracker & RPTracker)452   void initRPTracker(RegPressureTracker &RPTracker) {
453     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
454   }
455 
getBB()456   MachineBasicBlock *getBB() { return BB; }
getCurrentTop()457   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
getCurrentBottom()458   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
getLIS()459   LiveIntervals *getLIS() { return LIS; }
getMRI()460   MachineRegisterInfo *getMRI() { return &MRI; }
getTRI()461   const TargetRegisterInfo *getTRI() { return TRI; }
GetTopo()462   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
getEntrySU()463   SUnit& getEntrySU() { return EntrySU; }
getExitSU()464   SUnit& getExitSU() { return ExitSU; }
465 
466   void restoreSULinksLeft();
467 
468   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
469                                                      _Iterator End,
470                                                      unsigned &VgprUsage,
471                                                      unsigned &SgprUsage);
472 
getInRegs()473   std::set<unsigned> getInRegs() {
474     std::set<unsigned> InRegs;
475     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
476       InRegs.insert(RegMaskPair.RegUnit);
477     }
478     return InRegs;
479   }
480 
getOutRegs()481   std::set<unsigned> getOutRegs() {
482     std::set<unsigned> OutRegs;
483     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
484       OutRegs.insert(RegMaskPair.RegUnit);
485     }
486     return OutRegs;
487   };
488 
getVGPRSetID()489   unsigned getVGPRSetID() const { return VGPRSetID; }
getSGPRSetID()490   unsigned getSGPRSetID() const { return SGPRSetID; }
491 
492 private:
493   void topologicalSort();
494   // After scheduling is done, improve low latency placements.
495   void moveLowLatencies();
496 
497 public:
498   // Some stats for scheduling inside blocks.
499   std::vector<unsigned> IsLowLatencySU;
500   std::vector<unsigned> LowLatencyOffset;
501   std::vector<unsigned> IsHighLatencySU;
502   // Topological sort
503   // Maps topological index to the node number.
504   std::vector<int> TopDownIndex2SU;
505   std::vector<int> BottomUpIndex2SU;
506 };
507 
508 } // end namespace llvm
509 
510 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
511