/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_node.h"

#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
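// Device-name predicates. Illustrative examples (exact strings vary by setup
// and are lowercased before use): a device ending in "/stream:all", such as
// ".../device:gpu:0/stream:all", counts as accelerator time, while a canonical
// device name such as "/job:0/replica:0/task:0/device:gpu:0" or
// ".../device:cpu:0" counts as CPU time.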
bool CountAsAcceleratorTime(const string& device) {
  return device.find("stream:all") != device.npos;
}
bool CountAsCPUTime(const string& device) {
  return RE2::FullMatch(device, ".*/(device:gpu|gpu|device:cpu|cpu):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }

// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros. All are kernel times.
// For CPU, op_end_rel_micros is the kernel time, while all_end_rel_micros
// includes some post-processing. Besides, currently, there is no way to
// measure the execution time of async ops accurately.
//
// Notes about device:
// For ops on gpu:
// They will appear under three different devices in RunMetadata: 1) gpu:x,
// 2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is a combined view of all
// the different 3). 1) is the op scheduling, pre-processing and
// post-processing time. 3) is the execution time of GPU kernels on a stream.
// For ops on cpu:
// They will only appear as cpu:0.
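// Illustrative device strings (lowercased; the /job:... prefix is elided):
//   .../device:gpu:0              -> view 1) scheduling + pre/post processing
//   .../device:gpu:0/stream:all   -> view 2) combined kernel time
//   .../device:gpu:0/stream:3     -> view 3) kernel time on one stream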

void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (exec_.all_start_micros() > 0) {
      exec_.set_all_start_micros(
          std::min(static_cast<int64>(exec_.all_start_micros()),
                   static_cast<int64>(step_stat.all_start_micros())));
    } else {
      exec_.set_all_start_micros(step_stat.all_start_micros());
    }
    int64 op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round very quick executions up to 1 microsecond to be semantically
    // robust.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    exec_.set_latest_end_micros(
        std::max(static_cast<int64>(exec_.latest_end_micros()),
                 step_stat.all_start_micros() + op_end_rel_micros));

    const std::pair<int64, int64> pair =
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
    if (CountAsAcceleratorTime(dev)) {
      accelerator_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
    } else if (CountAsCPUTime(dev)) {
      cpu_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
      // In a while loop, a graph node can be executed multiple times under
      // the same name.
      exec_.set_run_count(exec_.run_count() + 1);
    }
  }
}

void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  ExecMemory exec_mem;
  if (step_stat.all_start_micros() > 0) {
    exec_mem.set_memory_micros(step_stat.all_start_micros() +
                               step_stat.op_end_rel_micros());
  } else {
    absl::FPrintF(stderr, "%s has no start time, skipping\n",
                  step_stat.node_name());
    return;
  }

  int accelerator_allocator_cnt = 0;
  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    ++accelerator_allocator_cnt;
    exec_mem.set_allocator_bytes_in_use(
        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                 static_cast<int64>(mem.allocator_bytes_in_use())));
    for (const auto& alloc : mem.allocation_records()) {
      allocations_.push_back(alloc);
    }
  }
  if (accelerator_allocator_cnt > 1) {
    absl::FPrintF(stderr, "found %d gpu allocators for 1 node\n",
                  accelerator_allocator_cnt);
  }

  int64 total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe use allocated_bytes.
      int64 output_bytes = std::max(output.tensor_description()
                                        .allocation_description()
                                        .allocated_bytes(),
                                    output.tensor_description()
                                        .allocation_description()
                                        .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;

      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
      mem.set_ptr(output_ptr);
      mem.set_bytes(output_bytes);
    }
  }
  exec_mem.set_output_bytes(total_output_bytes);

  if (step_stat.has_memory_stats()) {
    if (IsPlacedOnCPU(dev)) {
      // Currently we assume ops placed on gpu only allocate memory on gpu.
      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
                                   step_stat.memory_stats().temp_memory_size());
      exec_mem.set_host_persistent_bytes(
          exec_mem.host_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    } else {
      exec_mem.set_accelerator_temp_bytes(
          exec_mem.accelerator_temp_bytes() +
          step_stat.memory_stats().temp_memory_size());
      exec_mem.set_accelerator_persistent_bytes(
          exec_mem.accelerator_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    }
  }

  // TODO(xpan): Make this more accurate:
  // High level: Memory tracking is suspicious and requires a large-scale
  // cleanup.
  // Investigate the memory usage difference between CPU/GPU with OpViewTest.
  //
  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
  //    discuss OpKernelContext-related allocations.
  // 2. allocate_output calls allocate_tensor, which is properly tracked in
  //    'NodeExecStats.memory'.
  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
  //    in 'NodeExecStats.memory_stats'.
  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
  //    in 'NodeExecStats.memory'. However, there is no way to count it as
  //    persistent now.
  // 5. record_xxx_persistent is called when allocate_persistent
  //    is not used, and hence tracks some complementary bytes. It appears in
  //    'NodeExecStats.memory_stats'. It's suspicious, but we should
  //    use it for now since it covers the constant op.
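  //
  // In summary, the accounting below computes:
  //   residual  = sum(live_bytes)  + host/accelerator persistent bytes
  //   requested = sum(total_bytes) + persistent bytes + temp bytes
  //   peak      = sum(peak_bytes)  + persistent bytes + temp bytes
  // where the sums run over step_stat.memory().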
  int64 residual_bytes = 0;
  int64 requested_bytes = 0;
  int64 peak_bytes = 0;
  for (const auto& mem : step_stat.memory()) {
    residual_bytes += mem.live_bytes();
    requested_bytes += mem.total_bytes();
    peak_bytes += mem.peak_bytes();
  }
  residual_bytes += exec_mem.host_persistent_bytes() +
                    exec_mem.accelerator_persistent_bytes();
  requested_bytes += exec_mem.host_persistent_bytes() +
                     exec_mem.accelerator_persistent_bytes() +
                     exec_mem.host_temp_bytes() +
                     exec_mem.accelerator_temp_bytes();
  peak_bytes += exec_mem.host_persistent_bytes() +
                exec_mem.accelerator_persistent_bytes() +
                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();

  exec_mem.set_requested_bytes(requested_bytes);
  exec_mem.set_residual_bytes(residual_bytes);
  exec_mem.set_peak_bytes(peak_bytes);
  memory_execs_.emplace_back(exec_mem);
}

void TFGraphNode::AddStepStat(int64 step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = absl::AsciiStrToLower(device);

  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
  // It can have an ad-hoc suffix, such as /stream:xx or /memcpy:xx.
  if (IsCanonicalDevice(dev)) {
    if (!node_.canonical_device().empty()) {
      if (node_.canonical_device() != dev) {
        // TODO(xpan): Some RunMetadata nodes appear on multiple devices.
        // Need to address it.
        return;
      }
    } else {
      node_.set_canonical_device(dev);
      // TODO(xpan): Support things other than gpu?
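      // Derive the host device by rewriting the gpu suffix, e.g.
      // (illustrative) /job:0/replica:0/task:0/device:gpu:1 becomes
      // /job:0/replica:0/task:0/device:cpu:0.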
      node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
      AddOpType(node_.canonical_device());
    }
  }

  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == node_.canonical_device()) {
    exec->second.AddMemoryStats(dev, step_stat);
  }
}

int64 ExecStep::exec_micros() const {
  return accelerator_exec_micros() + cpu_exec_micros();
}

int64 ExecStep::accelerator_exec_micros() const {
  int64 total = 0;
  // Normally, an op should only be scheduled on 1 accelerator device.
  // Hence there should generally be 1 element in accelerator_execs_.
  for (const auto& execs : accelerator_execs_) {
    // An op can fire multiple kernels or be scheduled multiple times in a
    // while loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

int64 ExecStep::cpu_exec_micros() const {
  int64 total = 0;
  // Normally, an op can only be scheduled on 1 device.
  for (const auto& execs : cpu_execs_) {
    // An op can be scheduled multiple times in a while loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}
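// Flattens a TensorShapeProto into a vector of dimension sizes. A rank-0
// (scalar) shape is represented as {1} so that multiplying the entries still
// yields the parameter count; an unknown rank yields an empty vector.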
std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
  std::vector<int64> shape_vec;
  if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
    // Scalar parameter with empty shape but known rank.
    shape_vec.push_back(1);
  } else {
    for (const auto& d : shape_pb.dim()) {
      shape_vec.push_back(d.size());
    }
  }
  return shape_vec;
}

TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec) {
  TensorShapeProto shape_pb;
  if (shape_vec.empty()) {
    shape_pb.set_unknown_rank(true);
    return shape_pb;
  }
  for (const int64 s : shape_vec) {
    shape_pb.add_dim()->set_size(s);
  }
  return shape_pb;
}

bool IsPlacedOnAccelerator(const string& device) {
  return device.find("gpu") != device.npos;
}
bool IsPlacedOnCPU(const string& device) {
  return device.find("cpu") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow