1 /* Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/internal/tfprof_node.h"
17 
18 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
19 
20 namespace tensorflow {
21 namespace tfprof {
CountAsAcceleratorTime(const string & device)22 bool CountAsAcceleratorTime(const string& device) {
23   return device.find("stream:all") != device.npos;
24 }
CountAsCPUTime(const string & device)25 bool CountAsCPUTime(const string& device) {
26   return RE2::FullMatch(device,
27                         ".*/(device:gpu|gpu|device:cpu|cpu|device:sycl):\\d+");
28 }
IsCanonicalDevice(const string & device)29 bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
30 
31 // Notes about start and end time from the NodeExecStats proto:
32 // For GPU, there is no difference between op_end_rel_micros and
33 // all_end_rel_micros. All are kernel times.
34 // For CPU, op_end_rel is the kernel time, while all_end_rel_micros includes
35 // some post-processing. Besides, currently, there is no way to measure
36 // the execution time of async ops accurately.
37 //
38 // Notes about device:
39 // For ops on gpu:
40 // It will appear in three different devices in RunMetadata: 1) gpu:x,
41 // 2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is used a combined view
42 // of all different 3). 1) is the op scheduling, pre-processing and
43 // post processing time. 3) is the execution time of GPU kernels on a stream.
44 // For ops on cpu:
45 // It will only appear as cpu:0.
46 
AddTimeStats(const string & dev,const NodeExecStats & step_stat)47 void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
48   devices_.insert(dev);
49   if (step_stat.all_start_micros() > 0) {
50     if (exec_.all_start_micros() > 0) {
51       exec_.set_all_start_micros(
52           std::min(static_cast<int64>(exec_.all_start_micros()),
53                    static_cast<int64>(step_stat.all_start_micros())));
54     } else {
55       exec_.set_all_start_micros(step_stat.all_start_micros());
56     }
57     int64 op_end_rel_micros = step_stat.op_end_rel_micros();
58     // Round quick execution to 1 micro to be semantically robust.
59     if (op_end_rel_micros == 0) {
60       ++op_end_rel_micros;
61     }
62     exec_.set_latest_end_micros(
63         std::max(static_cast<int64>(exec_.latest_end_micros()),
64                  step_stat.all_start_micros() + op_end_rel_micros));
65 
66     const std::pair<int64, int64> pair =
67         std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
68     if (CountAsAcceleratorTime(dev)) {
69       accelerator_execs_[dev].push_back(pair);
70       op_execs_[dev].push_back(pair);
71     } else if (CountAsCPUTime(dev)) {
72       cpu_execs_[dev].push_back(pair);
73       op_execs_[dev].push_back(pair);
74       // In while-loop, a graph node is executed multiple times under
75       // the same name.
76       exec_.set_run_count(exec_.run_count() + 1);
77     }
78   }
79 }
80 
AddMemoryStats(const string & dev,const NodeExecStats & step_stat)81 void ExecStep::AddMemoryStats(const string& dev,
82                               const NodeExecStats& step_stat) {
83   ExecMemory exec_mem;
84   if (step_stat.all_start_micros() > 0) {
85     exec_mem.set_memory_micros(step_stat.all_start_micros() +
86                                step_stat.op_end_rel_micros());
87   } else {
88     fprintf(stderr, "%s has no start time, skipping\n",
89             step_stat.node_name().c_str());
90     return;
91   }
92 
93   int accelerator_allocator_cnt = 0;
94   for (const auto& mem : step_stat.memory()) {
95     // TODO(xpan): Fix this hack. Currently the allocator name seems quite
96     // ad-hoc.
97     if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
98       continue;
99     }
100     ++accelerator_allocator_cnt;
101     exec_mem.set_allocator_bytes_in_use(
102         std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
103                  static_cast<int64>(mem.allocator_bytes_in_use())));
104     for (const auto& alloc : mem.allocation_records()) {
105       allocations_.push_back(alloc);
106     }
107   }
108   if (accelerator_allocator_cnt > 1) {
109     fprintf(stderr, "found %d gpu allocator for 1 node\n",
110             accelerator_allocator_cnt);
111   }
112 
113   int64 total_output_bytes = 0;
114   for (const auto& output : step_stat.output()) {
115     if (output.has_tensor_description() &&
116         output.tensor_description().has_allocation_description()) {
117       // TODO(xpan): Maybe allocated_bytes.
118       int64 output_bytes = std::max(output.tensor_description()
119                                         .allocation_description()
120                                         .allocated_bytes(),
121                                     output.tensor_description()
122                                         .allocation_description()
123                                         .requested_bytes());
124       uint64 output_ptr =
125           output.tensor_description().allocation_description().ptr();
126       total_output_bytes += output_bytes;
127 
128       auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
129       mem.set_ptr(output_ptr);
130       mem.set_bytes(output_bytes);
131     }
132   }
133   exec_mem.set_output_bytes(total_output_bytes);
134 
135   if (step_stat.has_memory_stats()) {
136     if (IsPlacedOnCPU(dev)) {
137       // Currently we assume ops placed on gpu only allocate memory on gpu.
138       exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
139                                    step_stat.memory_stats().temp_memory_size());
140       exec_mem.set_host_persistent_bytes(
141           exec_mem.host_persistent_bytes() +
142           step_stat.memory_stats().persistent_memory_size());
143     } else {
144       exec_mem.set_accelerator_temp_bytes(
145           exec_mem.accelerator_temp_bytes() +
146           step_stat.memory_stats().temp_memory_size());
147       exec_mem.set_accelerator_persistent_bytes(
148           exec_mem.accelerator_persistent_bytes() +
149           step_stat.memory_stats().persistent_memory_size());
150     }
151   }
152 
153   // TODO(xpan): Make this more accurate:
154   // High level: Memory tracking is suspicious and requires large scale
155   // clean up.
156   // Investigte the memory usage difference between CPU/GPU with OpViewTest.
157   //
158   // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
159   //    discuss OpKernelContext-related allocations.
160   // 2. allocate_output calls allocate_tensor, which is properly tracked in
161   //    'NodeExecStats.memory'.
162   // 3. allocate_temp is only tracked through record_xxx_temp. It appears
163   //    in 'NodeExecStats.memory_stats'.
164   // 4. allocate_persistent calls allocate_tensor, which is properly tracked
165   //    in 'NodeExecStats.memory'. However, there is no way to count it as
166   //    persistent now.
167   // 5. record_xxx_persistent is called when allocate_persistent
168   //    is not used and hence tracks some complementary bytes. It appears in
169   //    'NodeExecStats.memory_stats'. It's suspicious. But we should
170   //    use it now since it covers constant op.
171   int64 residual_bytes = 0;
172   int64 requested_bytes = 0;
173   int64 peak_bytes = 0;
174   for (const auto& mem : step_stat.memory()) {
175     residual_bytes += mem.live_bytes();
176     requested_bytes += mem.total_bytes();
177     peak_bytes += mem.peak_bytes();
178   }
179   residual_bytes += exec_mem.host_persistent_bytes() +
180                     exec_mem.accelerator_persistent_bytes();
181   requested_bytes += exec_mem.host_persistent_bytes() +
182                      exec_mem.accelerator_persistent_bytes() +
183                      exec_mem.host_temp_bytes() +
184                      exec_mem.accelerator_temp_bytes();
185   peak_bytes += exec_mem.host_persistent_bytes() +
186                 exec_mem.accelerator_persistent_bytes() +
187                 exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();
188 
189   exec_mem.set_requested_bytes(requested_bytes);
190   exec_mem.set_residual_bytes(residual_bytes);
191   exec_mem.set_peak_bytes(peak_bytes);
192   memory_execs_.emplace_back(exec_mem);
193 }
194 
AddStepStat(int64 step,const string & device,const NodeExecStats & step_stat)195 void TFGraphNode::AddStepStat(int64 step, const string& device,
196                               const NodeExecStats& step_stat) {
197   string dev = str_util::Lowercase(device);
198 
199   // TODO(xpan): Make this more robust?
200   // See run_metadata_test.py
201   // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
202   // It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
203   if (IsCanonicalDevice(dev)) {
204     if (!node_.canonical_device().empty()) {
205       if (node_.canonical_device() != dev) {
206         // TODO(xpan): Some RunMetadata node appears at multiple devices.
207         // Need to address it.
208         return;
209       }
210     } else {
211       node_.set_canonical_device(dev);
212       // TODO(xpan): Support things other than gpu?
213       if (dev.find("sycl") != dev.npos) {
214         node_.set_host_device(StringReplace(dev, "device:sycl:\\d+", "cpu:0"));
215       } else {
216         node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
217       }
218       AddOpType(node_.canonical_device());
219     }
220   }
221 
222   auto exec = execs_.find(step);
223   if (exec == execs_.end()) {
224     execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
225     exec = execs_.find(step);
226   }
227 
228   exec->second.AddTimeStats(dev, step_stat);
229 
230   if (dev == node_.canonical_device()) {
231     exec->second.AddMemoryStats(dev, step_stat);
232   }
233 }
234 
exec_micros() const235 int64 ExecStep::exec_micros() const {
236   return accelerator_exec_micros() + cpu_exec_micros();
237 }
238 
accelerator_exec_micros() const239 int64 ExecStep::accelerator_exec_micros() const {
240   int64 total = 0;
241   // Normally, an op should only be scheduled on 1 accelerator device.
242   // Hence there should generally be 1 element in accelerator_execs_.
243   for (const auto& execs : accelerator_execs_) {
244     // An op can fire multiple kernels or
245     // being scheduled multiple times in while-loop.
246     for (const auto& exec : execs.second) {
247       total += exec.second;
248     }
249   }
250   return total;
251 }
252 
cpu_exec_micros() const253 int64 ExecStep::cpu_exec_micros() const {
254   int64 total = 0;
255   // Normally, an op can only be scheduled on 1 device.
256   for (const auto& execs : cpu_execs_) {
257     // An op can be scheduled multiple times in while-loop.
258     for (const auto& exec : execs.second) {
259       total += exec.second;
260     }
261   }
262   return total;
263 }
264 
ShapeProtoToVec(const TensorShapeProto & shape_pb)265 std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
266   std::vector<int64> shape_vec;
267   if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
268     // Scalar parameter with empty shape but known rank.
269     shape_vec.push_back(1);
270   } else {
271     for (const auto& d : shape_pb.dim()) {
272       shape_vec.push_back(d.size());
273     }
274   }
275   return shape_vec;
276 }
277 
VecToShapeProto(const std::vector<int64> & shape_vec)278 TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec) {
279   TensorShapeProto shape_pb;
280   if (shape_vec.empty()) {
281     shape_pb.set_unknown_rank(true);
282     return shape_pb;
283   }
284   for (const int64 s : shape_vec) {
285     shape_pb.add_dim()->set_size(s);
286   }
287   return shape_pb;
288 }
289 
IsPlacedOnAccelerator(const string & device)290 bool IsPlacedOnAccelerator(const string& device) {
291   return device.find("gpu") != device.npos ||
292          device.find("sycl") != device.npos;
293 }
IsPlacedOnCPU(const string & device)294 bool IsPlacedOnCPU(const string& device) {
295   return device.find("cpu") != device.npos;
296 }
297 }  // namespace tfprof
298 }  // namespace tensorflow
299