1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_
18 #define ART_COMPILER_OPTIMIZING_SCHEDULER_H_
19 
20 #include <fstream>
21 
22 #include "base/scoped_arena_allocator.h"
23 #include "base/scoped_arena_containers.h"
24 #include "base/time_utils.h"
25 #include "code_generator.h"
26 #include "load_store_analysis.h"
27 #include "nodes.h"
28 #include "optimization.h"
29 
30 namespace art {
31 
32 // General description of instruction scheduling.
33 //
34 // This pass tries to improve the quality of the generated code by reordering
35 // instructions in the graph to avoid execution delays caused by execution
36 // dependencies.
37 // Currently, scheduling is performed at the block level, so no `HInstruction`
38 // ever leaves its block in this pass.
39 //
40 // The scheduling process iterates through blocks in the graph. For blocks that
41 // we can and want to schedule:
42 // 1) Build a dependency graph for instructions.
43 //    It includes data dependencies (inputs/uses), but also environment
44 //    dependencies and side-effect dependencies.
45 // 2) Schedule the dependency graph.
46 //    This is a topological sort of the dependency graph, using heuristics to
47 //    decide what node to scheduler first when there are multiple candidates.
48 //
49 // A few factors impacting the quality of the scheduling are:
50 // - The heuristics used to decide what node to schedule in the topological sort
51 //   when there are multiple valid candidates. There is a wide range of
52 //   complexity possible here, going from a simple model only considering
53 //   latencies, to a super detailed CPU pipeline model.
54 // - Fewer dependencies in the dependency graph give more freedom for the
55 //   scheduling heuristics. For example de-aliasing can allow possibilities for
56 //   reordering of memory accesses.
57 // - The level of abstraction of the IR. It is easier to evaluate scheduling for
58 //   IRs that translate to a single assembly instruction than for IRs
59 //   that generate multiple assembly instructions or generate different code
60 //   depending on properties of the IR.
61 // - Scheduling is performed before register allocation, it is not aware of the
62 //   impact of moving instructions on register allocation.
63 //
64 //
65 // The scheduling code uses the terms predecessors, successors, and dependencies.
66 // This can be confusing at times, so here are clarifications.
67 // These terms are used from the point of view of the program dependency graph. So
68 // the inputs of an instruction are part of its dependencies, and hence part its
69 // predecessors. So the uses of an instruction are (part of) its successors.
70 // (Side-effect dependencies can yield predecessors or successors that are not
71 // inputs or uses.)
72 //
73 // Here is a trivial example. For the Java code:
74 //
75 //    int a = 1 + 2;
76 //
77 // we would have the instructions
78 //
79 //    i1 HIntConstant 1
80 //    i2 HIntConstant 2
81 //    i3 HAdd [i1,i2]
82 //
83 // `i1` and `i2` are predecessors of `i3`.
84 // `i3` is a successor of `i1` and a successor of `i2`.
85 // In a scheduling graph for this code we would have three nodes `n1`, `n2`,
86 // and `n3` (respectively for instructions `i1`, `i1`, and `i3`).
87 // Conceptually the program dependency graph for this would contain two edges
88 //
89 //    n1 -> n3
90 //    n2 -> n3
91 //
92 // Since we schedule backwards (starting from the last instruction in each basic
93 // block), the implementation of nodes keeps a list of pointers their
94 // predecessors. So `n3` would keep pointers to its predecessors `n1` and `n2`.
95 //
96 // Node dependencies are also referred to from the program dependency graph
97 // point of view: we say that node `B` immediately depends on `A` if there is an
98 // edge from `A` to `B` in the program dependency graph. `A` is a predecessor of
99 // `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and
100 // `n2`.
101 // Since nodes in the scheduling graph keep a list of their predecessors, node
102 // `B` will have a pointer to its predecessor `A`.
103 // As we schedule backwards, `B` will be selected for scheduling before `A` is.
104 //
105 // So the scheduling for the example above could happen as follow
106 //
107 //    |---------------------------+------------------------|
108 //    | candidates for scheduling | instructions scheduled |
109 //    | --------------------------+------------------------|
110 //
111 // The only node without successors is `n3`, so it is the only initial
112 // candidate.
113 //
114 //    | n3                        | (none)                 |
115 //
116 // We schedule `n3` as the last (and only) instruction. All its predecessors
117 // that do not have any unscheduled successors become candidate. That is, `n1`
118 // and `n2` become candidates.
119 //
120 //    | n1, n2                    | n3                     |
121 //
122 // One of the candidates is selected. In practice this is where scheduling
123 // heuristics kick in, to decide which of the candidates should be selected.
124 // In this example, let it be `n1`. It is scheduled before previously scheduled
125 // nodes (in program order). There are no other nodes to add to the list of
126 // candidates.
127 //
128 //    | n2                        | n1                     |
129 //    |                           | n3                     |
130 //
131 // The only candidate available for scheduling is `n2`. Schedule it before
132 // (in program order) the previously scheduled nodes.
133 //
134 //    | (none)                    | n2                     |
135 //    |                           | n1                     |
136 //    |                           | n3                     |
137 //    |---------------------------+------------------------|
138 //
139 // So finally the instructions will be executed in the order `i2`, `i1`, and `i3`.
140 // In this trivial example, it does not matter which of `i1` and `i2` is
141 // scheduled first since they are constants. However the same process would
142 // apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`).
143 
144 // Set to true to have instruction scheduling dump scheduling graphs to the file
145 // `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`.
146 static constexpr bool kDumpDotSchedulingGraphs = false;
147 
148 // Typically used as a default instruction latency.
149 static constexpr uint32_t kGenericInstructionLatency = 1;
150 
151 class HScheduler;
152 
153 /**
154  * A node representing an `HInstruction` in the `SchedulingGraph`.
155  */
156 class SchedulingNode : public DeletableArenaObject<kArenaAllocScheduler> {
157  public:
SchedulingNode(HInstruction * instr,ScopedArenaAllocator * allocator,bool is_scheduling_barrier)158   SchedulingNode(HInstruction* instr, ScopedArenaAllocator* allocator, bool is_scheduling_barrier)
159       : latency_(0),
160         internal_latency_(0),
161         critical_path_(0),
162         instruction_(instr),
163         is_scheduling_barrier_(is_scheduling_barrier),
164         data_predecessors_(allocator->Adapter(kArenaAllocScheduler)),
165         other_predecessors_(allocator->Adapter(kArenaAllocScheduler)),
166         num_unscheduled_successors_(0) {
167     data_predecessors_.reserve(kPreallocatedPredecessors);
168   }
169 
AddDataPredecessor(SchedulingNode * predecessor)170   void AddDataPredecessor(SchedulingNode* predecessor) {
171     data_predecessors_.push_back(predecessor);
172     predecessor->num_unscheduled_successors_++;
173   }
174 
GetDataPredecessors()175   const ScopedArenaVector<SchedulingNode*>& GetDataPredecessors() const {
176     return data_predecessors_;
177   }
178 
AddOtherPredecessor(SchedulingNode * predecessor)179   void AddOtherPredecessor(SchedulingNode* predecessor) {
180     other_predecessors_.push_back(predecessor);
181     predecessor->num_unscheduled_successors_++;
182   }
183 
GetOtherPredecessors()184   const ScopedArenaVector<SchedulingNode*>& GetOtherPredecessors() const {
185     return other_predecessors_;
186   }
187 
DecrementNumberOfUnscheduledSuccessors()188   void DecrementNumberOfUnscheduledSuccessors() {
189     num_unscheduled_successors_--;
190   }
191 
MaybeUpdateCriticalPath(uint32_t other_critical_path)192   void MaybeUpdateCriticalPath(uint32_t other_critical_path) {
193     critical_path_ = std::max(critical_path_, other_critical_path);
194   }
195 
HasUnscheduledSuccessors()196   bool HasUnscheduledSuccessors() const {
197     return num_unscheduled_successors_ != 0;
198   }
199 
GetInstruction()200   HInstruction* GetInstruction() const { return instruction_; }
GetLatency()201   uint32_t GetLatency() const { return latency_; }
SetLatency(uint32_t latency)202   void SetLatency(uint32_t latency) { latency_ = latency; }
GetInternalLatency()203   uint32_t GetInternalLatency() const { return internal_latency_; }
SetInternalLatency(uint32_t internal_latency)204   void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; }
GetCriticalPath()205   uint32_t GetCriticalPath() const { return critical_path_; }
IsSchedulingBarrier()206   bool IsSchedulingBarrier() const { return is_scheduling_barrier_; }
207 
208  private:
209   // The latency of this node. It represents the latency between the moment the
210   // last instruction for this node has executed to the moment the result
211   // produced by this node is available to users.
212   uint32_t latency_;
213   // This represents the time spent *within* the generated code for this node.
214   // It should be zero for nodes that only generate a single instruction.
215   uint32_t internal_latency_;
216 
217   // The critical path from this instruction to the end of scheduling. It is
218   // used by the scheduling heuristics to measure the priority of this instruction.
219   // It is defined as
220   //     critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses)
221   // (Note that here 'uses' is equivalent to 'data successors'. Also see comments in
222   // `HScheduler::Schedule(SchedulingNode* scheduling_node)`).
223   uint32_t critical_path_;
224 
225   // The instruction that this node represents.
226   HInstruction* const instruction_;
227 
228   // If a node is scheduling barrier, other nodes cannot be scheduled before it.
229   const bool is_scheduling_barrier_;
230 
231   // The lists of predecessors. They cannot be scheduled before this node. Once
232   // this node is scheduled, we check whether any of its predecessors has become a
233   // valid candidate for scheduling.
234   // Predecessors in `data_predecessors_` are data dependencies. Those in
235   // `other_predecessors_` contain side-effect dependencies, environment
236   // dependencies, and scheduling barrier dependencies.
237   ScopedArenaVector<SchedulingNode*> data_predecessors_;
238   ScopedArenaVector<SchedulingNode*> other_predecessors_;
239 
240   // The number of unscheduled successors for this node. This number is
241   // decremented as successors are scheduled. When it reaches zero this node
242   // becomes a valid candidate to schedule.
243   uint32_t num_unscheduled_successors_;
244 
245   static constexpr size_t kPreallocatedPredecessors = 4;
246 };
247 
248 /*
249  * Directed acyclic graph for scheduling.
250  */
251 class SchedulingGraph : public ValueObject {
252  public:
SchedulingGraph(const HScheduler * scheduler,ScopedArenaAllocator * allocator,const HeapLocationCollector * heap_location_collector)253   SchedulingGraph(const HScheduler* scheduler,
254                   ScopedArenaAllocator* allocator,
255                   const HeapLocationCollector* heap_location_collector)
256       : scheduler_(scheduler),
257         allocator_(allocator),
258         contains_scheduling_barrier_(false),
259         nodes_map_(allocator_->Adapter(kArenaAllocScheduler)),
260         heap_location_collector_(heap_location_collector) {}
261 
262   SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
263     std::unique_ptr<SchedulingNode> node(
264         new (allocator_) SchedulingNode(instr, allocator_, is_scheduling_barrier));
265     SchedulingNode* result = node.get();
266     nodes_map_.insert(std::make_pair(instr, std::move(node)));
267     contains_scheduling_barrier_ |= is_scheduling_barrier;
268     AddDependencies(instr, is_scheduling_barrier);
269     return result;
270   }
271 
GetNode(const HInstruction * instr)272   SchedulingNode* GetNode(const HInstruction* instr) const {
273     auto it = nodes_map_.find(instr);
274     if (it == nodes_map_.end()) {
275       return nullptr;
276     } else {
277       return it->second.get();
278     }
279   }
280 
281   bool IsSchedulingBarrier(const HInstruction* instruction) const;
282 
283   bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const;
284   bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const;
285   bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const;
286   bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const;
287 
Size()288   size_t Size() const {
289     return nodes_map_.size();
290   }
291 
292   // Dump the scheduling graph, in dot file format, appending it to the file
293   // `scheduling_graphs.dot`.
294   void DumpAsDotGraph(const std::string& description,
295                       const ScopedArenaVector<SchedulingNode*>& initial_candidates);
296 
297  protected:
298   void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency);
AddDataDependency(SchedulingNode * node,SchedulingNode * dependency)299   void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) {
300     AddDependency(node, dependency, /*is_data_dependency*/true);
301   }
AddOtherDependency(SchedulingNode * node,SchedulingNode * dependency)302   void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) {
303     AddDependency(node, dependency, /*is_data_dependency*/false);
304   }
305   bool HasMemoryDependency(HInstruction* node, HInstruction* other) const;
306   bool HasExceptionDependency(const HInstruction* node, const HInstruction* other) const;
307   bool HasSideEffectDependency(HInstruction* node, HInstruction* other) const;
308   bool ArrayAccessMayAlias(HInstruction* node, HInstruction* other) const;
309   bool FieldAccessMayAlias(const HInstruction* node, const HInstruction* other) const;
310   size_t ArrayAccessHeapLocation(HInstruction* instruction) const;
311   size_t FieldAccessHeapLocation(HInstruction* obj, const FieldInfo* field) const;
312 
313   // Add dependencies nodes for the given `HInstruction`: inputs, environments, and side-effects.
314   void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false);
315 
316   const HScheduler* const scheduler_;
317 
318   ScopedArenaAllocator* const allocator_;
319 
320   bool contains_scheduling_barrier_;
321 
322   ScopedArenaHashMap<const HInstruction*, std::unique_ptr<SchedulingNode>> nodes_map_;
323 
324   const HeapLocationCollector* const heap_location_collector_;
325 };
326 
327 /*
328  * The visitors derived from this base class are used by schedulers to evaluate
329  * the latencies of `HInstruction`s.
330  */
331 class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
332  public:
333   // This class and its sub-classes will never be used to drive a visit of an
334   // `HGraph` but only to visit `HInstructions` one at a time, so we do not need
335   // to pass a valid graph to `HGraphDelegateVisitor()`.
SchedulingLatencyVisitor()336   SchedulingLatencyVisitor()
337       : HGraphDelegateVisitor(nullptr),
338         last_visited_latency_(0),
339         last_visited_internal_latency_(0) {}
340 
VisitInstruction(HInstruction * instruction)341   void VisitInstruction(HInstruction* instruction) override {
342     LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". "
343         "Architecture-specific scheduling latency visitors must handle all instructions"
344         " (potentially by overriding the generic `VisitInstruction()`.";
345     UNREACHABLE();
346   }
347 
Visit(HInstruction * instruction)348   void Visit(HInstruction* instruction) {
349     instruction->Accept(this);
350   }
351 
CalculateLatency(SchedulingNode * node)352   void CalculateLatency(SchedulingNode* node) {
353     // By default nodes have no internal latency.
354     last_visited_internal_latency_ = 0;
355     Visit(node->GetInstruction());
356   }
357 
GetLastVisitedLatency()358   uint32_t GetLastVisitedLatency() const { return last_visited_latency_; }
GetLastVisitedInternalLatency()359   uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; }
360 
361  protected:
362   // The latency of the most recent visited SchedulingNode.
363   // This is for reporting the latency value to the user of this visitor.
364   uint32_t last_visited_latency_;
365   // This represents the time spent *within* the generated code for the most recent visited
366   // SchedulingNode. This is for reporting the internal latency value to the user of this visitor.
367   uint32_t last_visited_internal_latency_;
368 };
369 
370 class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
371  public:
Reset()372   virtual void Reset() {}
373   virtual SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
374                                                  const SchedulingGraph& graph) = 0;
~SchedulingNodeSelector()375   virtual ~SchedulingNodeSelector() {}
376  protected:
DeleteNodeAtIndex(ScopedArenaVector<SchedulingNode * > * nodes,size_t index)377   static void DeleteNodeAtIndex(ScopedArenaVector<SchedulingNode*>* nodes, size_t index) {
378     (*nodes)[index] = nodes->back();
379     nodes->pop_back();
380   }
381 };
382 
383 /*
384  * Select a `SchedulingNode` at random within the candidates.
385  */
386 class RandomSchedulingNodeSelector : public SchedulingNodeSelector {
387  public:
RandomSchedulingNodeSelector()388   RandomSchedulingNodeSelector() : seed_(0) {
389     seed_  = static_cast<uint32_t>(NanoTime());
390     srand(seed_);
391   }
392 
PopHighestPriorityNode(ScopedArenaVector<SchedulingNode * > * nodes,const SchedulingGraph & graph)393   SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
394                                          const SchedulingGraph& graph) override {
395     UNUSED(graph);
396     DCHECK(!nodes->empty());
397     size_t select = rand_r(&seed_) % nodes->size();
398     SchedulingNode* select_node = (*nodes)[select];
399     DeleteNodeAtIndex(nodes, select);
400     return select_node;
401   }
402 
403   uint32_t seed_;
404 };
405 
406 /*
407  * Select a `SchedulingNode` according to critical path information,
408  * with heuristics to favor certain instruction patterns like materialized condition.
409  */
410 class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
411  public:
CriticalPathSchedulingNodeSelector()412   CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
413 
Reset()414   void Reset() override { prev_select_ = nullptr; }
415   SchedulingNode* PopHighestPriorityNode(ScopedArenaVector<SchedulingNode*>* nodes,
416                                          const SchedulingGraph& graph) override;
417 
418  protected:
419   SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate,
420                                                   SchedulingNode* check) const;
421 
422   SchedulingNode* SelectMaterializedCondition(ScopedArenaVector<SchedulingNode*>* nodes,
423                                               const SchedulingGraph& graph) const;
424 
425  private:
426   const SchedulingNode* prev_select_;
427 };
428 
429 class HScheduler {
430  public:
HScheduler(SchedulingLatencyVisitor * latency_visitor,SchedulingNodeSelector * selector)431   HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector)
432       : latency_visitor_(latency_visitor),
433         selector_(selector),
434         only_optimize_loop_blocks_(true),
435         cursor_(nullptr) {}
~HScheduler()436   virtual ~HScheduler() {}
437 
438   void Schedule(HGraph* graph);
439 
SetOnlyOptimizeLoopBlocks(bool loop_only)440   void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; }
441 
442   // Instructions can not be rescheduled across a scheduling barrier.
443   virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
444 
445  protected:
446   void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector);
447   void Schedule(SchedulingNode* scheduling_node,
448                 /*inout*/ ScopedArenaVector<SchedulingNode*>* candidates);
449   void Schedule(HInstruction* instruction);
450 
451   // Any instruction returning `false` via this method will prevent its
452   // containing basic block from being scheduled.
453   // This method is used to restrict scheduling to instructions that we know are
454   // safe to handle.
455   //
456   // For newly introduced instructions by default HScheduler::IsSchedulable returns false.
457   // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see
458   // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also
459   // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction.
460   virtual bool IsSchedulable(const HInstruction* instruction) const;
461   bool IsSchedulable(const HBasicBlock* block) const;
462 
CalculateLatency(SchedulingNode * node)463   void CalculateLatency(SchedulingNode* node) {
464     latency_visitor_->CalculateLatency(node);
465     node->SetLatency(latency_visitor_->GetLastVisitedLatency());
466     node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
467   }
468 
469   SchedulingLatencyVisitor* const latency_visitor_;
470   SchedulingNodeSelector* const selector_;
471   bool only_optimize_loop_blocks_;
472 
473   // A pointer indicating where the next instruction to be scheduled will be inserted.
474   HInstruction* cursor_;
475 
476  private:
477   DISALLOW_COPY_AND_ASSIGN(HScheduler);
478 };
479 
IsSchedulingBarrier(const HInstruction * instruction)480 inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const {
481   return scheduler_->IsSchedulingBarrier(instruction);
482 }
483 
484 class HInstructionScheduling : public HOptimization {
485  public:
486   HInstructionScheduling(HGraph* graph,
487                          InstructionSet instruction_set,
488                          CodeGenerator* cg = nullptr,
489                          const char* name = kInstructionSchedulingPassName)
HOptimization(graph,name)490       : HOptimization(graph, name),
491         codegen_(cg),
492         instruction_set_(instruction_set) {}
493 
Run()494   bool Run() override {
495     return Run(/*only_optimize_loop_blocks*/ true, /*schedule_randomly*/ false);
496   }
497 
498   bool Run(bool only_optimize_loop_blocks, bool schedule_randomly);
499 
500   static constexpr const char* kInstructionSchedulingPassName = "scheduler";
501 
502  private:
503   CodeGenerator* const codegen_;
504   const InstructionSet instruction_set_;
505   DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling);
506 };
507 
508 }  // namespace art
509 
510 #endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_H_
511