1 //===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief SI Machine Scheduler interface
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
16 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
17 
18 #include "SIInstrInfo.h"
19 #include "llvm/CodeGen/MachineScheduler.h"
20 #include "llvm/CodeGen/RegisterPressure.h"
21 
22 using namespace llvm;
23 
24 namespace llvm {
25 
26 enum SIScheduleCandReason {
27   NoCand,
28   RegUsage,
29   Latency,
30   Successor,
31   Depth,
32   NodeOrder
33 };
34 
35 struct SISchedulerCandidate {
36   // The reason for this candidate.
37   SIScheduleCandReason Reason;
38 
39   // Set of reasons that apply to multiple candidates.
40   uint32_t RepeatReasonSet;
41 
SISchedulerCandidateSISchedulerCandidate42   SISchedulerCandidate()
43     :  Reason(NoCand), RepeatReasonSet(0) {}
44 
isRepeatSISchedulerCandidate45   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
setRepeatSISchedulerCandidate46   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
47 };
48 
49 class SIScheduleDAGMI;
50 class SIScheduleBlockCreator;
51 
52 class SIScheduleBlock {
53   SIScheduleDAGMI *DAG;
54   SIScheduleBlockCreator *BC;
55 
56   std::vector<SUnit*> SUnits;
57   std::map<unsigned, unsigned> NodeNum2Index;
58   std::vector<SUnit*> TopReadySUs;
59   std::vector<SUnit*> ScheduledSUnits;
60 
61   /// The top of the unscheduled zone.
62   IntervalPressure TopPressure;
63   RegPressureTracker TopRPTracker;
64 
65   // Pressure: number of said class of registers needed to
66   // store the live virtual and real registers.
67   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
68   // Pressure of additional registers required inside the block.
69   std::vector<unsigned> InternalAdditionnalPressure;
70   // Pressure of input and output registers
71   std::vector<unsigned> LiveInPressure;
72   std::vector<unsigned> LiveOutPressure;
73   // Registers required by the block, and outputs.
74   // We do track only virtual registers.
75   // Note that some registers are not 32 bits,
76   // and thus the pressure is not equal
77   // to the number of live registers.
78   std::set<unsigned> LiveInRegs;
79   std::set<unsigned> LiveOutRegs;
80 
81   bool Scheduled;
82   bool HighLatencyBlock;
83 
84   std::vector<unsigned> HasLowLatencyNonWaitedParent;
85 
86   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
87   unsigned ID;
88 
89   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
90   std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
91   unsigned NumHighLatencySuccessors;
92 
93 public:
SIScheduleBlock(SIScheduleDAGMI * DAG,SIScheduleBlockCreator * BC,unsigned ID)94   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
95                   unsigned ID):
96     DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
97     TopRPTracker(TopPressure), Scheduled(false),
98     HighLatencyBlock(false), ID(ID),
99     Preds(), Succs(), NumHighLatencySuccessors(0) {};
100 
~SIScheduleBlock()101   ~SIScheduleBlock() {};
102 
getID()103   unsigned getID() const { return ID; }
104 
105   /// Functions for Block construction.
106   void addUnit(SUnit *SU);
107 
108   // When all SUs have been added.
109   void finalizeUnits();
110 
111   // Add block pred, which has instruction predecessor of SU.
112   void addPred(SIScheduleBlock *Pred);
113   void addSucc(SIScheduleBlock *Succ);
114 
getPreds()115   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
getSuccs()116   const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
117 
118   unsigned Height;  // Maximum topdown path length to block without outputs
119   unsigned Depth;   // Maximum bottomup path length to block without inputs
120 
getNumHighLatencySuccessors()121   unsigned getNumHighLatencySuccessors() const {
122     return NumHighLatencySuccessors;
123   }
124 
isHighLatencyBlock()125   bool isHighLatencyBlock() { return HighLatencyBlock; }
126 
127   // This is approximative.
128   // Ideally should take into accounts some instructions (rcp, etc)
129   // are 4 times slower.
getCost()130   int getCost() { return SUnits.size(); }
131 
132   // The block Predecessors and Successors must be all registered
133   // before fastSchedule().
134   // Fast schedule with no particular requirement.
135   void fastSchedule();
136 
getScheduledUnits()137   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
138 
139   // Complete schedule that will try to minimize reg pressure and
140   // low latencies, and will fill liveins and liveouts.
141   // Needs all MIs to be grouped between BeginBlock and EndBlock.
142   // The MIs can be moved after the scheduling,
143   // it is just used to allow correct track of live registers.
144   void schedule(MachineBasicBlock::iterator BeginBlock,
145                 MachineBasicBlock::iterator EndBlock);
146 
isScheduled()147   bool isScheduled() { return Scheduled; }
148 
149 
150   // Needs the block to be scheduled inside
151   // TODO: find a way to compute it.
getInternalAdditionnalRegUsage()152   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
153     return InternalAdditionnalPressure;
154   }
155 
getInRegs()156   std::set<unsigned> &getInRegs() { return LiveInRegs; }
getOutRegs()157   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
158 
159   void printDebug(bool Full);
160 
161 private:
162   struct SISchedCandidate : SISchedulerCandidate {
163     // The best SUnit candidate.
164     SUnit *SU;
165 
166     unsigned SGPRUsage;
167     unsigned VGPRUsage;
168     bool IsLowLatency;
169     unsigned LowLatencyOffset;
170     bool HasLowLatencyNonWaitedParent;
171 
SISchedCandidateSISchedCandidate172     SISchedCandidate()
173       : SU(nullptr) {}
174 
isValidSISchedCandidate175     bool isValid() const { return SU; }
176 
177     // Copy the status of another candidate without changing policy.
setBestSISchedCandidate178     void setBest(SISchedCandidate &Best) {
179       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
180       SU = Best.SU;
181       Reason = Best.Reason;
182       SGPRUsage = Best.SGPRUsage;
183       VGPRUsage = Best.VGPRUsage;
184       IsLowLatency = Best.IsLowLatency;
185       LowLatencyOffset = Best.LowLatencyOffset;
186       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
187     }
188   };
189 
190   void undoSchedule();
191 
192   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
193   void releaseSucc(SUnit *SU, SDep *SuccEdge);
194   // InOrOutBlock: restrict to links pointing inside the block (true),
195   // or restrict to links pointing outside the block (false).
196   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
197 
198   void nodeScheduled(SUnit *SU);
199   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
200   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
201   SUnit* pickNode();
202   void traceCandidate(const SISchedCandidate &Cand);
203   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
204                        MachineBasicBlock::iterator EndBlock);
205 };
206 
207 struct SIScheduleBlocks {
208   std::vector<SIScheduleBlock*> Blocks;
209   std::vector<int> TopDownIndex2Block;
210   std::vector<int> TopDownBlock2Index;
211 };
212 
213 enum SISchedulerBlockCreatorVariant {
214     LatenciesAlone,
215     LatenciesGrouped,
216     LatenciesAlonePlusConsecutive
217 };
218 
219 class SIScheduleBlockCreator {
220   SIScheduleDAGMI *DAG;
221   // unique_ptr handles freeing memory for us.
222   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
223   std::map<SISchedulerBlockCreatorVariant,
224            SIScheduleBlocks> Blocks;
225   std::vector<SIScheduleBlock*> CurrentBlocks;
226   std::vector<int> Node2CurrentBlock;
227 
228   // Topological sort
229   // Maps topological index to the node number.
230   std::vector<int> TopDownIndex2Block;
231   std::vector<int> TopDownBlock2Index;
232   std::vector<int> BottomUpIndex2Block;
233 
234   // 0 -> Color not given.
235   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
236   // Above -> Other groups.
237   int NextReservedID;
238   int NextNonReservedID;
239   std::vector<int> CurrentColoring;
240   std::vector<int> CurrentTopDownReservedDependencyColoring;
241   std::vector<int> CurrentBottomUpReservedDependencyColoring;
242 
243 public:
244   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
245   ~SIScheduleBlockCreator();
246 
247   SIScheduleBlocks
248   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
249 
250   bool isSUInBlock(SUnit *SU, unsigned ID);
251 
252 private:
253   // Give a Reserved color to every high latency.
254   void colorHighLatenciesAlone();
255 
256   // Create groups of high latencies with a Reserved color.
257   void colorHighLatenciesGroups();
258 
259   // Compute coloring for topdown and bottom traversals with
260   // different colors depending on dependencies on Reserved colors.
261   void colorComputeReservedDependencies();
262 
263   // Give color to all non-colored SUs according to Reserved groups dependencies.
264   void colorAccordingToReservedDependencies();
265 
266   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
267   // The new colors are computed according to the dependencies on the other blocks
268   // formed with colorAccordingToReservedDependencies.
269   void colorEndsAccordingToDependencies();
270 
271   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
272   void colorForceConsecutiveOrderInGroup();
273 
274   // Merge Constant loads that have all their users into another group to the group.
275   // (TODO: else if all their users depend on the same group, put them there)
276   void colorMergeConstantLoadsNextGroup();
277 
278   // Merge SUs that have all their users into another group to the group
279   void colorMergeIfPossibleNextGroup();
280 
281   // Merge SUs that have all their users into another group to the group,
282   // but only for Reserved groups.
283   void colorMergeIfPossibleNextGroupOnlyForReserved();
284 
285   // Merge SUs that have all their users into another group to the group,
286   // but only if the group is no more than a few SUs.
287   void colorMergeIfPossibleSmallGroupsToNextGroup();
288 
289   // Divides Blocks with important size.
290   // Idea of implementation: attribute new colors depending on topdown and
291   // bottom up links to other blocks.
292   void cutHugeBlocks();
293 
294   // Put in one group all instructions with no users in this scheduling region
295   // (we'd want these groups be at the end).
296   void regroupNoUserInstructions();
297 
298   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
299 
300   void topologicalSort();
301 
302   void scheduleInsideBlocks();
303 
304   void fillStats();
305 };
306 
307 enum SISchedulerBlockSchedulerVariant {
308   BlockLatencyRegUsage,
309   BlockRegUsageLatency,
310   BlockRegUsage
311 };
312 
313 class SIScheduleBlockScheduler {
314   SIScheduleDAGMI *DAG;
315   SISchedulerBlockSchedulerVariant Variant;
316   std::vector<SIScheduleBlock*> Blocks;
317 
318   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
319   std::set<unsigned> LiveRegs;
320   // Num of schedulable unscheduled blocks reading the register.
321   std::map<unsigned, unsigned> LiveRegsConsumers;
322 
323   std::vector<unsigned> LastPosHighLatencyParentScheduled;
324   int LastPosWaitedHighLatency;
325 
326   std::vector<SIScheduleBlock*> BlocksScheduled;
327   unsigned NumBlockScheduled;
328   std::vector<SIScheduleBlock*> ReadyBlocks;
329 
330   unsigned VregCurrentUsage;
331   unsigned SregCurrentUsage;
332 
333   // Currently is only approximation.
334   unsigned maxVregUsage;
335   unsigned maxSregUsage;
336 
337   std::vector<unsigned> BlockNumPredsLeft;
338   std::vector<unsigned> BlockNumSuccsLeft;
339 
340 public:
341   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
342                            SISchedulerBlockSchedulerVariant Variant,
343                            SIScheduleBlocks BlocksStruct);
~SIScheduleBlockScheduler()344   ~SIScheduleBlockScheduler() {};
345 
getBlocks()346   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; };
347 
getVGPRUsage()348   unsigned getVGPRUsage() { return maxVregUsage; };
getSGPRUsage()349   unsigned getSGPRUsage() { return maxSregUsage; };
350 
351 private:
352   struct SIBlockSchedCandidate : SISchedulerCandidate {
353     // The best Block candidate.
354     SIScheduleBlock *Block;
355 
356     bool IsHighLatency;
357     int VGPRUsageDiff;
358     unsigned NumSuccessors;
359     unsigned NumHighLatencySuccessors;
360     unsigned LastPosHighLatParentScheduled;
361     unsigned Height;
362 
SIBlockSchedCandidateSIBlockSchedCandidate363     SIBlockSchedCandidate()
364       : Block(nullptr) {}
365 
isValidSIBlockSchedCandidate366     bool isValid() const { return Block; }
367 
368     // Copy the status of another candidate without changing policy.
setBestSIBlockSchedCandidate369     void setBest(SIBlockSchedCandidate &Best) {
370       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
371       Block = Best.Block;
372       Reason = Best.Reason;
373       IsHighLatency = Best.IsHighLatency;
374       VGPRUsageDiff = Best.VGPRUsageDiff;
375       NumSuccessors = Best.NumSuccessors;
376       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
377       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
378       Height = Best.Height;
379     }
380   };
381 
382   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
383                            SIBlockSchedCandidate &TryCand);
384   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
385                             SIBlockSchedCandidate &TryCand);
386   SIScheduleBlock *pickBlock();
387 
388   void addLiveRegs(std::set<unsigned> &Regs);
389   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
390   void releaseBlockSuccs(SIScheduleBlock *Parent);
391   void blockScheduled(SIScheduleBlock *Block);
392 
393   // Check register pressure change
394   // by scheduling a block with these LiveIn and LiveOut.
395   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
396                                        std::set<unsigned> &OutRegs);
397 
398   void schedule();
399 };
400 
401 struct SIScheduleBlockResult {
402   std::vector<unsigned> SUs;
403   unsigned MaxSGPRUsage;
404   unsigned MaxVGPRUsage;
405 };
406 
407 class SIScheduler {
408   SIScheduleDAGMI *DAG;
409   SIScheduleBlockCreator BlockCreator;
410 
411 public:
SIScheduler(SIScheduleDAGMI * DAG)412   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {};
413 
~SIScheduler()414   ~SIScheduler() {};
415 
416   struct SIScheduleBlockResult
417   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
418                   SISchedulerBlockSchedulerVariant ScheduleVariant);
419 };
420 
421 class SIScheduleDAGMI final : public ScheduleDAGMILive {
422   const SIInstrInfo *SITII;
423   const SIRegisterInfo *SITRI;
424 
425   std::vector<SUnit> SUnitsLinksBackup;
426 
427   // For moveLowLatencies. After all Scheduling variants are tested.
428   std::vector<unsigned> ScheduledSUnits;
429   std::vector<unsigned> ScheduledSUnitsInv;
430 
431   unsigned VGPRSetID;
432   unsigned SGPRSetID;
433 
434 public:
435   SIScheduleDAGMI(MachineSchedContext *C);
436 
437   ~SIScheduleDAGMI() override;
438 
439   // Entry point for the schedule.
440   void schedule() override;
441 
442   // To init Block's RPTracker.
initRPTracker(RegPressureTracker & RPTracker)443   void initRPTracker(RegPressureTracker &RPTracker) {
444     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
445   }
446 
getBB()447   MachineBasicBlock *getBB() { return BB; }
getCurrentTop()448   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; };
getCurrentBottom()449   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; };
getLIS()450   LiveIntervals *getLIS() { return LIS; }
getMRI()451   MachineRegisterInfo *getMRI() { return &MRI; }
getTRI()452   const TargetRegisterInfo *getTRI() { return TRI; }
getEntrySU()453   SUnit& getEntrySU() { return EntrySU; };
getExitSU()454   SUnit& getExitSU() { return ExitSU; };
455 
456   void restoreSULinksLeft();
457 
458   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
459                                                      _Iterator End,
460                                                      unsigned &VgprUsage,
461                                                      unsigned &SgprUsage);
getInRegs()462   std::set<unsigned> getInRegs() {
463     std::set<unsigned> InRegs;
464     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
465       InRegs.insert(RegMaskPair.RegUnit);
466     }
467     return InRegs;
468   };
469 
getVGPRSetID()470   unsigned getVGPRSetID() const { return VGPRSetID; }
getSGPRSetID()471   unsigned getSGPRSetID() const { return SGPRSetID; }
472 
473 private:
474   void topologicalSort();
475   // After scheduling is done, improve low latency placements.
476   void moveLowLatencies();
477 
478 public:
479   // Some stats for scheduling inside blocks.
480   std::vector<unsigned> IsLowLatencySU;
481   std::vector<unsigned> LowLatencyOffset;
482   std::vector<unsigned> IsHighLatencySU;
483   // Topological sort
484   // Maps topological index to the node number.
485   std::vector<int> TopDownIndex2SU;
486   std::vector<int> BottomUpIndex2SU;
487 };
488 
489 } // namespace llvm
490 
491 #endif /* SIMACHINESCHEDULER_H_ */
492