/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_node.h"

#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
bool CountAsAcceleratorTime(const string& device) {
  return device.find("stream:all") != device.npos;
}
bool CountAsCPUTime(const string& device) {
  return RE2::FullMatch(device, ".*/(device:gpu|gpu|device:cpu|cpu):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
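// For illustration: a device name like ".../device:gpu:0/stream:all" counts
// as accelerator time, while canonical names such as ".../device:gpu:0" or
// ".../device:cpu:0" match the RE2 pattern above and count as CPU time.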

// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros. All are kernel times.
// For CPU, op_end_rel_micros is the kernel time, while all_end_rel_micros
// includes some post-processing. Besides, currently, there is no way to
// measure the execution time of async ops accurately.
//
// Notes about device:
// For ops on GPU:
// An op appears under three different devices in RunMetadata: 1) gpu:x,
// 2) gpu:x/stream:all and 3) gpu:x/stream:id. 2) is used as a combined view
// of all the different 3). 1) is the op scheduling, pre-processing and
// post-processing time. 3) is the execution time of GPU kernels on a stream.
// For ops on CPU:
// An op appears only as cpu:0.
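// For illustration (lowercased, job prefix omitted), a single GPU op might be
// recorded under: 1) /device:gpu:0, 2) /device:gpu:0/stream:all, and
// 3) /device:gpu:0/stream:123 (the stream id here is made up).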

void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (exec_.all_start_micros() > 0) {
      exec_.set_all_start_micros(
          std::min(static_cast<int64>(exec_.all_start_micros()),
                   static_cast<int64>(step_stat.all_start_micros())));
    } else {
      exec_.set_all_start_micros(step_stat.all_start_micros());
    }
    int64 op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round quick executions up to 1 micro to be semantically robust.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    exec_.set_latest_end_micros(
        std::max(static_cast<int64>(exec_.latest_end_micros()),
                 step_stat.all_start_micros() + op_end_rel_micros));

    const std::pair<int64, int64> pair =
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
    if (CountAsAcceleratorTime(dev)) {
      accelerator_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
    } else if (CountAsCPUTime(dev)) {
      cpu_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
      // In a while-loop, a graph node can be executed multiple times under
      // the same name.
      exec_.set_run_count(exec_.run_count() + 1);
    }
  }
}
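// For illustration: a node executed twice inside a while-loop on the
// canonical cpu/gpu device contributes two (start_micros, duration_micros)
// pairs to cpu_execs_ and ends the step with run_count() == 2; its kernels
// recorded under ".../stream:all" go to accelerator_execs_ instead and do
// not affect run_count.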

void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  ExecMemory exec_mem;
  if (step_stat.all_start_micros() > 0) {
    exec_mem.set_memory_micros(step_stat.all_start_micros() +
                               step_stat.op_end_rel_micros());
  } else {
    absl::FPrintF(stderr, "%s has no start time, skipping\n",
                  step_stat.node_name());
    return;
  }

  int accelerator_allocator_cnt = 0;
  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    ++accelerator_allocator_cnt;
    exec_mem.set_allocator_bytes_in_use(
        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                 static_cast<int64>(mem.allocator_bytes_in_use())));
    for (const auto& alloc : mem.allocation_records()) {
      allocations_.push_back(alloc);
    }
  }
  if (accelerator_allocator_cnt > 1) {
    absl::FPrintF(stderr, "found %d gpu allocators for 1 node\n",
                  accelerator_allocator_cnt);
  }

  int64 total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe use allocated_bytes.
      int64 output_bytes = std::max(output.tensor_description()
                                        .allocation_description()
                                        .allocated_bytes(),
                                    output.tensor_description()
                                        .allocation_description()
                                        .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;

      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
      mem.set_ptr(output_ptr);
      mem.set_bytes(output_bytes);
    }
  }
  exec_mem.set_output_bytes(total_output_bytes);

  if (step_stat.has_memory_stats()) {
    if (IsPlacedOnCPU(dev)) {
      // Currently we assume ops placed on GPU only allocate memory on GPU.
      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
                                   step_stat.memory_stats().temp_memory_size());
      exec_mem.set_host_persistent_bytes(
          exec_mem.host_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    } else {
      exec_mem.set_accelerator_temp_bytes(
          exec_mem.accelerator_temp_bytes() +
          step_stat.memory_stats().temp_memory_size());
      exec_mem.set_accelerator_persistent_bytes(
          exec_mem.accelerator_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    }
  }

  // TODO(xpan): Make this more accurate:
  // High level: memory tracking is suspicious and requires a large-scale
  // clean-up.
  // Investigate the memory usage difference between CPU/GPU with OpViewTest.
  //
  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
  //    discuss OpKernelContext-related allocations.
  // 2. allocate_output calls allocate_tensor, which is properly tracked in
  //    'NodeExecStats.memory'.
  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
  //    in 'NodeExecStats.memory_stats'.
  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
  //    in 'NodeExecStats.memory'. However, there is no way to count it as
  //    persistent now.
  // 5. record_xxx_persistent is called when allocate_persistent
  //    is not used and hence tracks some complementary bytes. It appears in
  //    'NodeExecStats.memory_stats'. It's suspicious. But we should
  //    use it now since it covers constant ops.
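  // For illustration (made-up numbers): an op whose single output tensor
  // requested 4 MB (still live afterwards, total == peak == 4 MB) and that
  // recorded 1 MB of host temp memory would end up with
  // requested_bytes == 5 MB, residual_bytes == 4 MB, peak_bytes == 5 MB.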
  int64 residual_bytes = 0;
  int64 requested_bytes = 0;
  int64 peak_bytes = 0;
  for (const auto& mem : step_stat.memory()) {
    residual_bytes += mem.live_bytes();
    requested_bytes += mem.total_bytes();
    peak_bytes += mem.peak_bytes();
  }
  residual_bytes += exec_mem.host_persistent_bytes() +
                    exec_mem.accelerator_persistent_bytes();
  requested_bytes += exec_mem.host_persistent_bytes() +
                     exec_mem.accelerator_persistent_bytes() +
                     exec_mem.host_temp_bytes() +
                     exec_mem.accelerator_temp_bytes();
  peak_bytes += exec_mem.host_persistent_bytes() +
                exec_mem.accelerator_persistent_bytes() +
                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();

  exec_mem.set_requested_bytes(requested_bytes);
  exec_mem.set_residual_bytes(residual_bytes);
  exec_mem.set_peak_bytes(peak_bytes);
  memory_execs_.emplace_back(exec_mem);
}

void TFGraphNode::AddStepStat(int64 step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = absl::AsciiStrToLower(device);

  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
  // It can have an ad-hoc suffix, such as /stream:xx or /memcpy:xx.
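  // For illustration: a canonical device "/device:gpu:0" yields host_device
  // "/device:cpu:0" via the StringReplace call below.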
  if (IsCanonicalDevice(dev)) {
    if (!node_.canonical_device().empty()) {
      if (node_.canonical_device() != dev) {
        // TODO(xpan): Some RunMetadata nodes appear on multiple devices.
        // Need to address it.
        return;
      }
    } else {
      node_.set_canonical_device(dev);
      // TODO(xpan): Support things other than gpu?
      node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
      AddOpType(node_.canonical_device());
    }
  }

  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == node_.canonical_device()) {
    exec->second.AddMemoryStats(dev, step_stat);
  }
}

int64 ExecStep::exec_micros() const {
  return accelerator_exec_micros() + cpu_exec_micros();
}

int64 ExecStep::accelerator_exec_micros() const {
  int64 total = 0;
  // Normally, an op should only be scheduled on 1 accelerator device.
  // Hence there should generally be 1 element in accelerator_execs_.
  for (const auto& execs : accelerator_execs_) {
    // An op can fire multiple kernels or
    // be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

int64 ExecStep::cpu_exec_micros() const {
  int64 total = 0;
  // Normally, an op can only be scheduled on 1 device.
  for (const auto& execs : cpu_execs_) {
    // An op can be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}
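// For illustration (made-up durations): kernel intervals of 5us and 2us under
// ".../stream:all" plus a 3us scheduling interval on the canonical device
// yield accelerator_exec_micros() == 7, cpu_exec_micros() == 3, and
// exec_micros() == 10.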

std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
  std::vector<int64> shape_vec;
  if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
    // A scalar parameter has an empty shape but known rank; treat it as
    // a single element.
    shape_vec.push_back(1);
  } else {
    for (const auto& d : shape_pb.dim()) {
      shape_vec.push_back(d.size());
    }
  }
  return shape_vec;
}

TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec) {
  TensorShapeProto shape_pb;
  if (shape_vec.empty()) {
    shape_pb.set_unknown_rank(true);
    return shape_pb;
  }
  for (const int64 s : shape_vec) {
    shape_pb.add_dim()->set_size(s);
  }
  return shape_pb;
}
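// For illustration: a [2, 3] shape round-trips as {2, 3}; a known-rank scalar
// maps to {1} and an empty vector maps back to unknown_rank, so the two
// helpers are not exact inverses for scalars.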

bool IsPlacedOnAccelerator(const string& device) {
  return device.find("gpu") != device.npos;
}
bool IsPlacedOnCPU(const string& device) {
  return device.find("cpu") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow