1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <deque>
17 #include <memory>
18 #include <unordered_map>
19 
20 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h"
21 
22 #include "absl/memory/memory.h"
23 #include "tensorflow/compiler/xla/service/buffer_value.h"
24 #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
25 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
26 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
27 #include "tensorflow/compiler/xla/types.h"
28 
29 namespace xla {
30 namespace gpu {
31 
32 namespace {
33 
34 // An HLO partial ordering based on the actual stream assignment and thunk
35 // launch order.
36 class GpuHloOrdering : public PredecessorHloOrdering {
37  public:
38   GpuHloOrdering(const HloModule* module,
39                  const StreamAssignment& stream_assignment,
40                  const std::vector<HloInstruction*>& thunk_launch_order);
41   ~GpuHloOrdering() override = default;
42 
43   // Only the entry computation can possibly be sequentially ordered, and only
44   // if we've assigned all instructions to a single stream.
SequentialOrder(const HloComputation & computation) const45   const HloInstructionSequence* SequentialOrder(
46       const HloComputation& computation) const override {
47     return &computation == module_->entry_computation() ? entry_sequence_.get()
48                                                         : nullptr;
49   }
50 
ToString() const51   string ToString() const override { return ToStringHelper("GpuHloOrdering"); }
52 
53  private:
54   std::unique_ptr<HloInstructionSequence> entry_sequence_;
55 };
56 
GpuHloOrdering(const HloModule * module,const StreamAssignment & stream_assignment,const std::vector<HloInstruction * > & thunk_launch_order)57 GpuHloOrdering::GpuHloOrdering(
58     const HloModule* module, const StreamAssignment& stream_assignment,
59     const std::vector<HloInstruction*>& thunk_launch_order)
60     : PredecessorHloOrdering(module) {
61   // The entry computation has a total order when there's only one stream.
62   if (stream_assignment.StreamCount() == 1) {
63     entry_sequence_ =
64         absl::make_unique<HloInstructionSequence>(thunk_launch_order);
65   }
66 
67   // The ordering of instructions for the entry computation is determined by the
68   // total order of thunk launches, and stream assignment. Instructions are
69   // sequential within a stream and concurrent across streams. In addition, the
70   // GpuExecutable adds cross-stream dependency edges to ensure each instruction
71   // waits for its operands before executing.
72   //
73   // The predecessor map is built incrementally, in thunk launch order. We
74   // record the most-recently seen instructions per stream in
75   // 'last_instruction_per_stream'. This lets us quickly determine the
76   // same-stream predecessors of each instruction.
77 
78   // Compute the set of all instructions we will want to set reachability on.
79   auto predecessor_map = absl::make_unique<HloReachabilityMap>(
80       module->entry_computation()->MakeInstructionPostOrder());
81 
82   // The most recently visited instruction per stream.
83   std::vector<const HloInstruction*> last_instruction_per_stream(
84       stream_assignment.StreamCount(), nullptr);
85 
86   for (const HloInstruction* hlo : thunk_launch_order) {
87     predecessor_map->SetReachable(hlo, hlo);
88     if (stream_assignment.HasStreamAssigned(*hlo)) {
89       // Gather all instruction which are immediate predecessors of 'hlo' in the
90       // reachability graph.
91       std::vector<const HloInstruction*> immediate_preds;
92       immediate_preds.insert(immediate_preds.end(), hlo->operands().begin(),
93                              hlo->operands().end());
94       immediate_preds.insert(immediate_preds.end(),
95                              hlo->control_predecessors().begin(),
96                              hlo->control_predecessors().end());
97 
98       // All ops already queued on the same instruction stream, and their
99       // transitive predecessors, are predecessors.
100       const int stream_no = stream_assignment.StreamNumberForHlo(*hlo);
101       if (last_instruction_per_stream[stream_no] != nullptr) {
102         immediate_preds.push_back(last_instruction_per_stream[stream_no]);
103       }
104       predecessor_map->FastSetReachabilityToUnion(immediate_preds, hlo);
105       last_instruction_per_stream[stream_no] = hlo;
106     } else {
107       // Only parameters and constants don't have an assigned stream, since they
108       // don't require a thunk. These ops don't have any predecessors.
109       CHECK(hlo->opcode() == HloOpcode::kParameter ||
110             hlo->opcode() == HloOpcode::kConstant);
111       CHECK_EQ(hlo->operand_count(), 0);
112     }
113   }
114   predecessors_.emplace(module->entry_computation(),
115                         std::move(predecessor_map));
116 
117   // The ordering of instructions in subcomputations is based solely on control
118   // and data dependencies.
119   //
120   // TODO(toddw): Each subcomputation is actually emitted as a function in DFS
121   // postorder, so we can do better and establish the total order here. We don't
122   // do that yet since it's hard to ensure that the order here is the order used
123   // by IrEmitterNested. And mismatched ordering bugs would be hard to find.
124   for (auto* computation : module->computations()) {
125     if (computation != module->entry_computation() &&
126         !computation->IsFusionComputation()) {
127       predecessors_.emplace(computation,
128                             HloReachabilityMap::Build(computation));
129     }
130   }
131 }
132 
133 // Computes a topological launch_order that is close to a breadth-first
134 // order. This heuristic works well for graphs where concurrent kernels are
135 // located at the same layer. It can often reduce dependency between concurrent
136 // GEMMs due to intra-stream total orders.  E.g. consider the following HLO
137 // graph where the numbers in the parens indicate the stream assigned to each
138 // HLO.
139 //
140 //   A(0) -> D(0) -> E(1)
141 //    |
142 //    v
143 //   B(0)
144 //    |
145 //    v
146 //   C(0)
147 //
148 // If the total order is A,B,C,D,E, then C and E would be sequentialized
149 // because C completes before D starts in stream 0, and E depends on D.
150 // However, if the total order is A,B,D,C,E, then C and E can run
151 // concurrently.
BFSLaunchOrder(const HloComputation * computation,std::vector<HloInstruction * > * launch_order)152 void BFSLaunchOrder(const HloComputation* computation,
153                     std::vector<HloInstruction*>* launch_order) {
154   // This topological sort uses two data structures:
155   // 1. `incoming_edge_count` which keeps track of the number of incoming
156   // edges to each HLO;
157   // 2. `queue` which contains all HLOs with no incoming edges.
158   //
159   // The sorting algorithm repeatedly pops the top from the queue and deletes
160   // that HLO from the graph, making more HLOs incoming-edge free.
161   std::deque<HloInstruction*> queue;
162   std::unordered_map<const HloInstruction*, int64> incoming_edge_count;
163   for (auto* hlo : computation->instructions()) {
164     if (hlo->operand_count() == 0) {
165       queue.push_back(hlo);
166     } else {
167       incoming_edge_count[hlo] =
168           std::set<HloInstruction*>(hlo->operands().begin(),
169                                     hlo->operands().end())
170               .size();
171     }
172   }
173 
174   while (!queue.empty()) {
175     HloInstruction* x = queue.front();
176     queue.pop_front();
177     launch_order->push_back(x);
178     for (HloInstruction* y : x->users()) {
179       --incoming_edge_count[y];
180       if (incoming_edge_count[y] == 0) {
181         queue.push_back(y);
182       }
183     }
184   }
185 }
186 
187 }  // end namespace
188 
GpuHloSchedule()189 GpuHloSchedule::GpuHloSchedule() {}
190 
191 /* static */
Build(const HloModule & module,const StreamAssignment & stream_assignment,int64 pointer_size)192 StatusOr<std::unique_ptr<GpuHloSchedule>> GpuHloSchedule::Build(
193     const HloModule& module, const StreamAssignment& stream_assignment,
194     int64 pointer_size) {
195   std::unique_ptr<GpuHloSchedule> schedule(new GpuHloSchedule);
196 
197   // Initialize thunk_launch_order_, the total order of thunk launches.
198   HloComputation* entry_computation = module.entry_computation();
199   if (stream_assignment.StreamCount() == 1) {
200     // All kernels are launched on a single stream, so there's no loss of
201     // concurrency by optimizing for minimal memory usage.
202     TF_ASSIGN_OR_RETURN(
203         HloInstructionSequence sequence,
204         ScheduleComputation(
205             entry_computation, [pointer_size](const BufferValue& buffer) {
206               return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
207             }));
208     schedule->thunk_launch_order_ = sequence.instructions();
209   } else {
210     // BFS tends to increase concurrency, but also increases memory usage.
211     BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_);
212   }
213 
214   schedule->hlo_ordering_ = absl::make_unique<GpuHloOrdering>(
215       &module, stream_assignment, schedule->thunk_launch_order_);
216 
217   return std::move(schedule);
218 }
219 
220 }  // namespace gpu
221 }  // namespace xla
222