1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
17 
18 #include <memory>
19 #include <stack>
20 #include <unordered_set>
21 #include <vector>
22 
23 #include "absl/memory/memory.h"
24 #include "tensorflow/compiler/xla/service/hlo_computation.h"
25 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
26 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
27 #include "tensorflow/compiler/xla/service/stream_pool.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
30 #include "tensorflow/core/util/ptr_util.h"
31 
32 namespace xla {
33 namespace gpu {
34 namespace {
InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>> * timers,se::Stream * stream)35 void InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>>* timers,
36                        se::Stream* stream) {
37   timers->push(absl::make_unique<se::Timer>(stream->parent()));
38   stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get());
39 }
40 
GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>> * timers,const std::vector<StreamPool::Ptr> & sub_streams,se::Stream * stream,double clock_rate_ghz)41 uint64 GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>>* timers,
42                       const std::vector<StreamPool::Ptr>& sub_streams,
43                       se::Stream* stream, double clock_rate_ghz) {
44   CHECK_GT(timers->size(), 0);
45   stream->ThenWaitFor(&sub_streams);
46   stream->ThenStopTimer(timers->top().get());
47   stream->BlockHostUntilDone().IgnoreError();
48   double nanoseconds = timers->top()->Nanoseconds();
49   timers->pop();
50   return static_cast<uint64>(nanoseconds * clock_rate_ghz);
51 }
52 }  // namespace
53 
HloExecutionProfiler(bool do_profile,HloExecutionProfile * profile,se::Stream * stream,const std::vector<StreamPool::Ptr> & sub_streams,const HloComputation * computation)54 HloExecutionProfiler::HloExecutionProfiler(
55     bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
56     const std::vector<StreamPool::Ptr>& sub_streams,
57     const HloComputation* computation)
58     : do_profile_(do_profile),
59       profile_(profile),
60       stream_(stream),
61       sub_streams_(sub_streams),
62       computation_(computation) {
63   if (do_profile_) {
64     clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz();
65     InitAndStartTimer(&timers_, stream);
66   }
67 }
68 
FinishExecution()69 void HloExecutionProfiler::FinishExecution() {
70   CHECK(!finished_execution_) << "Call FinishExecution only once!";
71   finished_execution_ = true;
72   if (do_profile_) {
73     profile_->set_total_cycles_executed(
74         *computation_,
75         GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
76   }
77 }
78 
StartHloComputation()79 void HloExecutionProfiler::StartHloComputation() {
80   if (do_profile_) {
81     InitAndStartTimer(&timers_, stream_);
82   }
83 }
84 
FinishHloComputation(const HloComputation * computation)85 void HloExecutionProfiler::FinishHloComputation(
86     const HloComputation* computation) {
87   if (do_profile_) {
88     profile_->set_total_cycles_executed(
89         *computation,
90         GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
91   }
92 }
93 
StartHloInstruction()94 void HloExecutionProfiler::StartHloInstruction() {
95   if (do_profile_) {
96     InitAndStartTimer(&timers_, stream_);
97   }
98 }
99 
FinishHloInstruction(const HloInstruction * hlo_instruction)100 void HloExecutionProfiler::FinishHloInstruction(
101     const HloInstruction* hlo_instruction) {
102   if (do_profile_) {
103     hlo_instructions_.erase(hlo_instruction);
104     profile_->SetCyclesTakenBy(
105         hlo_instruction,
106         GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
107   }
108 }
109 
110 std::unique_ptr<ScopedInstructionProfiler>
MakeScopedInstructionProfiler(const HloInstruction * hlo_instruction)111 HloExecutionProfiler::MakeScopedInstructionProfiler(
112     const HloInstruction* hlo_instruction) {
113   if (do_profile_ && hlo_instruction != nullptr) {
114     // Make sure that we are not already measuring the time for the same
115     // 'hlo_instruction'.
116     CHECK(hlo_instructions_.insert(hlo_instruction).second)
117         << hlo_instruction->name();
118   }
119   return absl::make_unique<ScopedInstructionProfiler>(this, hlo_instruction);
120 }
121 
122 }  // namespace gpu
123 }  // namespace xla
124