1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h"
17 
18 #include <algorithm>
19 #include <unordered_set>
20 
21 #include "absl/memory/memory.h"
22 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
23 #include "tensorflow/compiler/xla/service/hlo_computation.h"
24 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
25 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
26 #include "tensorflow/compiler/xla/test_helpers.h"
27 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
28 #include "tensorflow/compiler/xla/tests/test_utils.h"
29 #include "tensorflow/compiler/xla/types.h"
30 
31 namespace xla {
32 namespace gpu {
33 
34 class GpuHloScheduleTest : public HloTestBase {
35  protected:
36   using HloVec = std::vector<HloInstruction*>;
37 
38   // Pre-canned shapes.
39   Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
40 
BuildGpuHloSchedule(const HloModule & module,const StreamAssignment & streams)41   static std::unique_ptr<GpuHloSchedule> BuildGpuHloSchedule(
42       const HloModule& module, const StreamAssignment& streams) {
43     return GpuHloSchedule::Build(module, streams, /*pointer_size=*/8)
44         .ConsumeValueOrDie();
45   }
46 
CreateNewVerifiedModule()47   std::unique_ptr<HloModule> CreateNewVerifiedModule() {
48     HloModuleConfig config;
49     auto debug_options = GetDebugOptionsForTest();
50     debug_options.set_xla_gpu_disable_multi_streaming(false);
51     config.set_debug_options(debug_options);
52     return absl::make_unique<HloModule>("test_module", config);
53   }
54 
RemoveHlo(const HloVec & input,const std::unordered_set<const HloInstruction * > & remove)55   HloVec RemoveHlo(const HloVec& input,
56                    const std::unordered_set<const HloInstruction*>& remove) {
57     HloVec result(input);
58     result.erase(std::remove_if(result.begin(), result.end(),
59                                 [&remove](const HloInstruction* x) {
60                                   return remove.count(x) > 0;
61                                 }),
62                  result.end());
63     return result;
64   }
65 };
66 
67 // Test of a single stream, where data dependencies fully determine the
68 // execution order.
TEST_F(GpuHloScheduleTest,SequentialMatMul)69 TEST_F(GpuHloScheduleTest, SequentialMatMul) {
70   HloComputation::Builder builder("entry_computation");
71   HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
72       /*parameter_number=*/0, f32_2x2_, /*name=*/"x"));
73   HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
74       /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
75   HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
76       /*parameter_number=*/2, f32_2x2_, /*name=*/"z"));
77   HloInstruction* dot1 =
78       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, x, y));
79   HloInstruction* dot2 =
80       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
81 
82   auto module = CreateNewVerifiedModule();
83   module->AddEntryComputation(builder.Build(dot2));
84 
85   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
86   EXPECT_EQ(streams->StreamNumberForHlo(*dot1),
87             streams->StreamNumberForHlo(*dot2));
88 
89   auto schedule = BuildGpuHloSchedule(*module, *streams);
90   // Remove parameters, which are unordered.
91   EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
92             HloVec({dot1, dot2}));
93 
94   // Parameters x,y,z are mutually unordered, while dot1 and dot2 are
95   // transitively ordered by operands.
96   auto order = schedule->ConsumeHloOrdering();
97   EXPECT_TRUE(order->ExecutesBefore(x, dot1));
98   EXPECT_TRUE(order->ExecutesBefore(x, dot2));
99   EXPECT_TRUE(order->ExecutesBefore(y, dot1));
100   EXPECT_TRUE(order->ExecutesBefore(y, dot2));
101   EXPECT_TRUE(order->ExecutesBefore(z, dot2));
102   EXPECT_TRUE(order->ExecutesBefore(dot1, dot2));
103 
104   EXPECT_FALSE(order->ExecutesBefore(x, x));
105   EXPECT_FALSE(order->ExecutesBefore(x, y));
106   EXPECT_FALSE(order->ExecutesBefore(x, z));
107   EXPECT_FALSE(order->ExecutesBefore(y, x));
108   EXPECT_FALSE(order->ExecutesBefore(y, y));
109   EXPECT_FALSE(order->ExecutesBefore(y, z));
110   EXPECT_FALSE(order->ExecutesBefore(z, x));
111   EXPECT_FALSE(order->ExecutesBefore(z, y));
112   EXPECT_FALSE(order->ExecutesBefore(z, z));
113   EXPECT_FALSE(order->ExecutesBefore(z, dot1));
114   EXPECT_FALSE(order->ExecutesBefore(dot1, x));
115   EXPECT_FALSE(order->ExecutesBefore(dot1, y));
116   EXPECT_FALSE(order->ExecutesBefore(dot1, z));
117   EXPECT_FALSE(order->ExecutesBefore(dot1, dot1));
118   EXPECT_FALSE(order->ExecutesBefore(dot2, x));
119   EXPECT_FALSE(order->ExecutesBefore(dot2, y));
120   EXPECT_FALSE(order->ExecutesBefore(dot2, z));
121   EXPECT_FALSE(order->ExecutesBefore(dot2, dot1));
122   EXPECT_FALSE(order->ExecutesBefore(dot2, dot2));
123 }
124 
125 // Test of a single stream, where data dependencies do not fully determine the
126 // execution order, but the stream assignment does.
TEST_F(GpuHloScheduleTest,SequentialAdd)127 TEST_F(GpuHloScheduleTest, SequentialAdd) {
128   HloComputation::Builder builder("entry_computation");
129   HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
130       /*parameter_number=*/0, f32_2x2_, /*name=*/"x"));
131   HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
132       /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
133   HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
134       /*parameter_number=*/2, f32_2x2_, /*name=*/"z"));
135   HloInstruction* add1 = builder.AddInstruction(
136       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, x, y));
137   HloInstruction* add2 = builder.AddInstruction(
138       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, y, z));
139   HloInstruction* add3 = builder.AddInstruction(
140       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2));
141 
142   auto module = CreateNewVerifiedModule();
143   module->AddEntryComputation(builder.Build(add3));
144 
145   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
146   EXPECT_EQ(streams->StreamNumberForHlo(*add1),
147             streams->StreamNumberForHlo(*add2));
148   EXPECT_EQ(streams->StreamNumberForHlo(*add1),
149             streams->StreamNumberForHlo(*add3));
150 
151   auto schedule = BuildGpuHloSchedule(*module, *streams);
152   // Remove parameters, which are unordered.
153   EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
154             HloVec({add1, add2, add3}));
155 
156   // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are
157   // transitively ordered by operands.
158   auto order = schedule->ConsumeHloOrdering();
159   EXPECT_TRUE(order->ExecutesBefore(x, add1));
160   EXPECT_TRUE(order->ExecutesBefore(x, add2));
161   EXPECT_TRUE(order->ExecutesBefore(x, add3));
162   EXPECT_TRUE(order->ExecutesBefore(y, add1));
163   EXPECT_TRUE(order->ExecutesBefore(y, add2));
164   EXPECT_TRUE(order->ExecutesBefore(y, add3));
165   EXPECT_TRUE(order->ExecutesBefore(z, add2));
166   EXPECT_TRUE(order->ExecutesBefore(z, add3));
167   EXPECT_TRUE(order->ExecutesBefore(add1, add3));
168   EXPECT_TRUE(order->ExecutesBefore(add2, add3));
169   // The HLO graph does not define an ordering for add1 and add2, but their
170   // assignment onto the same stream does define an ordering.
171   if (order->ExecutesBefore(add1, add2)) {
172     EXPECT_FALSE(order->ExecutesBefore(add2, add1));
173   } else {
174     EXPECT_TRUE(order->ExecutesBefore(add2, add1));
175     EXPECT_FALSE(order->ExecutesBefore(add1, add2));
176   }
177 
178   EXPECT_FALSE(order->ExecutesBefore(x, x));
179   EXPECT_FALSE(order->ExecutesBefore(x, y));
180   EXPECT_FALSE(order->ExecutesBefore(x, z));
181   EXPECT_FALSE(order->ExecutesBefore(y, x));
182   EXPECT_FALSE(order->ExecutesBefore(y, y));
183   EXPECT_FALSE(order->ExecutesBefore(y, z));
184   EXPECT_FALSE(order->ExecutesBefore(z, x));
185   EXPECT_FALSE(order->ExecutesBefore(z, y));
186   EXPECT_FALSE(order->ExecutesBefore(z, z));
187   EXPECT_FALSE(order->ExecutesBefore(z, add1));
188   EXPECT_FALSE(order->ExecutesBefore(add1, x));
189   EXPECT_FALSE(order->ExecutesBefore(add1, y));
190   EXPECT_FALSE(order->ExecutesBefore(add1, z));
191   EXPECT_FALSE(order->ExecutesBefore(add1, add1));
192   EXPECT_FALSE(order->ExecutesBefore(add2, x));
193   EXPECT_FALSE(order->ExecutesBefore(add2, y));
194   EXPECT_FALSE(order->ExecutesBefore(add2, z));
195   EXPECT_FALSE(order->ExecutesBefore(add2, add2));
196 }
197 
198 // Test of two streams.
TEST_F(GpuHloScheduleTest,ConcurrentMatMul)199 TEST_F(GpuHloScheduleTest, ConcurrentMatMul) {
200   HloComputation::Builder builder("entry_computation");
201   HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
202       /*parameter_number=*/0, f32_2x2_, /*name=*/"x"));
203   HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
204       /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
205   HloInstruction* dot1 =
206       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, x, y));
207   HloInstruction* dot2 =
208       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, y, x));
209   HloInstruction* add =
210       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, dot2));
211 
212   auto module = CreateNewVerifiedModule();
213   module->AddEntryComputation(builder.Build(add));
214 
215   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
216   EXPECT_NE(streams->StreamNumberForHlo(*dot1),
217             streams->StreamNumberForHlo(*dot2));
218 
219   auto schedule = BuildGpuHloSchedule(*module, *streams);
220   // Remove parameters, which are unordered.
221   HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y});
222   EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) ||
223               thunk_launch_order == HloVec({dot2, dot1, add}));
224 
225   // Parameters x,y are mutually unordered, while dot1, dot2 and add are
226   // transitively ordered by operands.
227   auto order = schedule->ConsumeHloOrdering();
228   EXPECT_TRUE(order->ExecutesBefore(x, dot1));
229   EXPECT_TRUE(order->ExecutesBefore(x, dot2));
230   EXPECT_TRUE(order->ExecutesBefore(y, dot1));
231   EXPECT_TRUE(order->ExecutesBefore(y, dot2));
232   EXPECT_TRUE(order->ExecutesBefore(dot1, add));
233   EXPECT_TRUE(order->ExecutesBefore(dot2, add));
234 
235   EXPECT_FALSE(order->ExecutesBefore(x, x));
236   EXPECT_FALSE(order->ExecutesBefore(x, y));
237   EXPECT_FALSE(order->ExecutesBefore(y, x));
238   EXPECT_FALSE(order->ExecutesBefore(y, y));
239   EXPECT_FALSE(order->ExecutesBefore(dot1, x));
240   EXPECT_FALSE(order->ExecutesBefore(dot1, y));
241   EXPECT_FALSE(order->ExecutesBefore(dot1, dot1));
242   EXPECT_FALSE(order->ExecutesBefore(dot1, dot2));
243   EXPECT_FALSE(order->ExecutesBefore(dot2, x));
244   EXPECT_FALSE(order->ExecutesBefore(dot2, y));
245   EXPECT_FALSE(order->ExecutesBefore(dot2, dot1));
246   EXPECT_FALSE(order->ExecutesBefore(dot2, dot2));
247   EXPECT_FALSE(order->ExecutesBefore(add, x));
248   EXPECT_FALSE(order->ExecutesBefore(add, y));
249   EXPECT_FALSE(order->ExecutesBefore(add, dot1));
250   EXPECT_FALSE(order->ExecutesBefore(add, dot2));
251   EXPECT_FALSE(order->ExecutesBefore(add, add));
252 }
253 
254 // Test of multiple streams.
TEST_F(GpuHloScheduleTest,LatticeMatMul)255 TEST_F(GpuHloScheduleTest, LatticeMatMul) {
256   //      d00      -- layer 0
257   //     /   \
258   //   d10   d11   -- layer 1
259   //  /   \ /   \
260   // d20  d21  d22 -- layer 2
261   //  \   / \   /
262   //   d30   d31   -- layer 3
263   //     \   /
264   //      d40      -- layer 4
265   HloComputation::Builder builder("entry_computation");
266   std::vector<HloInstruction*> params;
267   params.reserve(6);
268   for (int i = 0; i < 6; ++i) {
269     params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
270         i, f32_2x2_, /*name=*/absl::StrFormat("param%d", i))));
271   }
272   HloInstruction* d00 = builder.AddInstruction(
273       CreateCanonicalDot(f32_2x2_, params[2], params[3]));
274   HloInstruction* d10 =
275       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, params[1], d00));
276   HloInstruction* d11 =
277       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d00, params[4]));
278   HloInstruction* d20 =
279       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, params[0], d10));
280   HloInstruction* d21 =
281       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d10, d11));
282   HloInstruction* d22 =
283       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d11, params[5]));
284   HloInstruction* d30 =
285       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d20, d21));
286   HloInstruction* d31 =
287       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d21, d22));
288   HloInstruction* d40 =
289       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
290 
291   auto module = CreateNewVerifiedModule();
292   module->AddEntryComputation(builder.Build(d40));
293 
294   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
295   // The two dots on layer 1 are concurrent.
296   EXPECT_NE(streams->StreamNumberForHlo(*d10),
297             streams->StreamNumberForHlo(*d11));
298   // The three dots on layer 2 are concurrent.
299   EXPECT_NE(streams->StreamNumberForHlo(*d20),
300             streams->StreamNumberForHlo(*d21));
301   EXPECT_NE(streams->StreamNumberForHlo(*d20),
302             streams->StreamNumberForHlo(*d22));
303   EXPECT_NE(streams->StreamNumberForHlo(*d21),
304             streams->StreamNumberForHlo(*d22));
305   // The two dots on layer 3 are concurrent.
306   EXPECT_NE(streams->StreamNumberForHlo(*d30),
307             streams->StreamNumberForHlo(*d31));
308 
309   // We don't check the thunk launch order, since there are many valid total
310   // orders, and it's annoying to express.
311   auto schedule = BuildGpuHloSchedule(*module, *streams);
312 
313   auto order = schedule->ConsumeHloOrdering();
314   const HloVec all_params(
315       {params[0], params[1], params[2], params[3], params[4], params[5]});
316   const HloVec all_ops({d00, d10, d11, d20, d21, d22, d30, d31, d40});
317 
318   // Parameters are mutually unordered, and never execute before ops.
319   for (const HloInstruction* param : all_params) {
320     for (const HloInstruction* param2 : all_params) {
321       EXPECT_FALSE(order->ExecutesBefore(param, param2));
322     }
323     for (const HloInstruction* op : all_ops) {
324       EXPECT_FALSE(order->ExecutesBefore(op, param));
325     }
326   }
327 
328   // Check ordering of params before ops.
329   for (const HloInstruction* op : all_ops) {
330     if (op == d20 || op == d30 || op == d40) {
331       EXPECT_TRUE(order->ExecutesBefore(params[0], op));
332     } else {
333       EXPECT_FALSE(order->ExecutesBefore(params[0], op));
334     }
335     if (op != d00 && op != d11 && op != d22) {
336       EXPECT_TRUE(order->ExecutesBefore(params[1], op));
337     } else {
338       EXPECT_FALSE(order->ExecutesBefore(params[1], op));
339     }
340     EXPECT_TRUE(order->ExecutesBefore(params[2], op));
341     EXPECT_TRUE(order->ExecutesBefore(params[3], op));
342     if (op != d00 && op != d10 && op != d20) {
343       EXPECT_TRUE(order->ExecutesBefore(params[4], op));
344     } else {
345       EXPECT_FALSE(order->ExecutesBefore(params[4], op));
346     }
347     if (op == d22 || op == d31 || op == d40) {
348       EXPECT_TRUE(order->ExecutesBefore(params[5], op));
349     } else {
350       EXPECT_FALSE(order->ExecutesBefore(params[5], op));
351     }
352   }
353 
354   // Check ordering of ops before ops.
355   for (const HloInstruction* op : all_ops) {
356     if (op != d00) {
357       EXPECT_TRUE(order->ExecutesBefore(d00, op));
358     } else {
359       EXPECT_FALSE(order->ExecutesBefore(d00, op));
360     }
361 
362     if (op == d20 || op == d21 || op == d30 || op == d31 || op == d40) {
363       EXPECT_TRUE(order->ExecutesBefore(d10, op));
364     } else {
365       EXPECT_FALSE(order->ExecutesBefore(d10, op));
366     }
367 
368     if (op == d21 || op == d22 || op == d30 || op == d31 || op == d40) {
369       EXPECT_TRUE(order->ExecutesBefore(d11, op));
370     } else {
371       EXPECT_FALSE(order->ExecutesBefore(d11, op));
372     }
373 
374     if (op == d30 || op == d40) {
375       EXPECT_TRUE(order->ExecutesBefore(d20, op));
376     } else {
377       EXPECT_FALSE(order->ExecutesBefore(d20, op));
378     }
379 
380     if (op == d30 || op == d31 || op == d40) {
381       EXPECT_TRUE(order->ExecutesBefore(d21, op));
382     } else {
383       EXPECT_FALSE(order->ExecutesBefore(d21, op));
384     }
385 
386     if (op == d31 || op == d40) {
387       EXPECT_TRUE(order->ExecutesBefore(d22, op));
388     } else {
389       EXPECT_FALSE(order->ExecutesBefore(d22, op));
390     }
391 
392     if (op == d40) {
393       EXPECT_TRUE(order->ExecutesBefore(d30, op));
394       EXPECT_TRUE(order->ExecutesBefore(d31, op));
395     } else {
396       EXPECT_FALSE(order->ExecutesBefore(d30, op));
397       EXPECT_FALSE(order->ExecutesBefore(d31, op));
398     }
399 
400     EXPECT_FALSE(order->ExecutesBefore(d40, op));
401   }
402 }
403 
404 }  // namespace gpu
405 }  // namespace xla
406