1 /* Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
17 #define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
18 
19 #include <map>
20 #include <set>
21 #include <string>
22 #include <vector>
23 
24 #include "absl/strings/str_format.h"
25 #include "tensorflow/core/framework/allocation_description.pb.h"
26 #include "tensorflow/core/framework/attr_value.pb.h"
27 #include "tensorflow/core/framework/node_def.pb.h"
28 #include "tensorflow/core/framework/step_stats.pb.h"
29 #include "tensorflow/core/framework/tensor_description.pb.h"
30 #include "tensorflow/core/framework/tensor_shape.pb.h"
31 #include "tensorflow/core/lib/core/errors.h"
32 #include "tensorflow/core/platform/regexp.h"
33 #include "tensorflow/core/profiler/tfprof_log.pb.h"
34 #include "tensorflow/core/profiler/tfprof_options.h"
35 
36 namespace tensorflow {
37 namespace tfprof {
38 std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb);
39 
40 TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec);
41 
42 class TFGraphNode;
43 
44 class CallStack {
45  public:
46   class Trace {
47    public:
Trace(const CodeDef::Trace * trace,const std::map<int64,string> * id_to_string)48     Trace(const CodeDef::Trace* trace,
49           const std::map<int64, string>* id_to_string)
50         : trace_(trace), id_to_string_(id_to_string) {}
51 
lineno()52     const int32 lineno() const { return trace_->lineno(); }
file()53     string file() const {
54       // Backward compatible with old proto files.
55       if (!trace_->file().empty()) return trace_->file();
56       return id_to_string_->at(trace_->file_id());
57     }
function()58     string function() const {
59       // Backward compatible with old proto files.
60       if (!trace_->function().empty()) return trace_->function();
61       return id_to_string_->at(trace_->function_id());
62     }
func_start_line()63     int32 func_start_line() const { return trace_->func_start_line(); }
64 
65    private:
66     const CodeDef::Trace* trace_;
67     const std::map<int64, string>* id_to_string_;
68   };
69 
CallStack(const CodeDef & def,const std::map<int64,string> * id_to_string)70   CallStack(const CodeDef& def, const std::map<int64, string>* id_to_string)
71       : def_(def) {
72     traces_.reserve(def.traces_size());
73     for (const auto& t : def_.traces()) {
74       traces_.emplace_back(&t, id_to_string);
75     }
76   }
77 
code_def()78   const CodeDef& code_def() const { return def_; }
traces()79   const std::vector<Trace>& traces() const { return traces_; }
80 
81  private:
82   std::vector<Trace> traces_;
83   CodeDef def_;
84 };
85 
86 class ExecStep {
87  public:
ExecStep()88   ExecStep() {}
89 
90   void AddTimeStats(const string& dev, const NodeExecStats& step_stat);
91 
92   void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);
93 
run_count()94   int64 run_count() const { return exec_.run_count(); }
95   // The execution time of an op. If it runs on accelerator, then it's
96   // accelerator_exec_micros(). Otherwise, it's CPU time.
97   int64 exec_micros() const;
98   // The accelerator execution time of an op. 0 if not run on accelerator.
99   int64 accelerator_exec_micros() const;
100   // The cpu execution time of an op.
101   int64 cpu_exec_micros() const;
102 
op_execs()103   const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs()
104       const {
105     return op_execs_;
106   }
cpu_execs()107   const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs()
108       const {
109     return cpu_execs_;
110   }
all_start_micros()111   int64 all_start_micros() const { return exec_.all_start_micros(); }
latest_end_micros()112   int64 latest_end_micros() const { return exec_.latest_end_micros(); }
lastest_schedule_end_micros()113   int64 lastest_schedule_end_micros() const {
114     int64 ret = 0;
115     for (const auto& exec : cpu_execs_) {
116       for (const auto& pair : exec.second) {
117         ret = std::max(ret, pair.first + pair.second);
118       }
119     }
120     return ret;
121   }
requested_bytes()122   int64 requested_bytes() const {
123     int64 requested_bytes = 0;
124     for (const ExecMemory& exec : memory_execs_) {
125       requested_bytes += exec.requested_bytes();
126     }
127     return requested_bytes;
128   }
peak_bytes()129   int64 peak_bytes() const {
130     int64 peak_bytes = 0;
131     for (const ExecMemory& exec : memory_execs_) {
132       peak_bytes += exec.peak_bytes();
133     }
134     return peak_bytes;
135   }
residual_bytes()136   int64 residual_bytes() const {
137     int64 residual_bytes = 0;
138     for (const ExecMemory& exec : memory_execs_) {
139       residual_bytes += exec.residual_bytes();
140     }
141     return residual_bytes;
142   }
output_bytes()143   int64 output_bytes() const {
144     int64 output_bytes = 0;
145     for (const ExecMemory& exec : memory_execs_) {
146       output_bytes += exec.output_bytes();
147     }
148     return output_bytes;
149   }
accelerator_temp_bytes()150   int64 accelerator_temp_bytes() const {
151     int64 accelerator_temp_bytes = 0;
152     for (const ExecMemory& exec : memory_execs_) {
153       accelerator_temp_bytes += exec.accelerator_temp_bytes();
154     }
155     return accelerator_temp_bytes;
156   }
host_temp_bytes()157   int64 host_temp_bytes() const {
158     int64 host_temp_bytes = 0;
159     for (const ExecMemory& exec : memory_execs_) {
160       host_temp_bytes += exec.host_temp_bytes();
161     }
162     return host_temp_bytes;
163   }
accelerator_persistent_bytes()164   int64 accelerator_persistent_bytes() const {
165     int64 accelerator_persistent_bytes = 0;
166     for (const ExecMemory& exec : memory_execs_) {
167       accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
168     }
169     return accelerator_persistent_bytes;
170   }
host_persistent_bytes()171   int64 host_persistent_bytes() const {
172     int64 host_persistent_bytes = 0;
173     for (const ExecMemory& exec : memory_execs_) {
174       host_persistent_bytes += exec.host_persistent_bytes();
175     }
176     return host_persistent_bytes;
177   }
allocator_bytes_in_use()178   std::map<int64, int64> allocator_bytes_in_use() const {
179     std::map<int64, int64> bytes_in_use;
180     for (const ExecMemory& exec : memory_execs_) {
181       bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
182     }
183     return bytes_in_use;
184   }
185 
allocations()186   const std::vector<AllocationRecord>& allocations() const {
187     return allocations_;
188   }
189 
ToProto()190   const ExecProfile& ToProto() {
191     exec_.mutable_accelerator_execs()->clear();
192     for (const auto& e : accelerator_execs_) {
193       auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first];
194       for (const auto& p : e.second) {
195         auto* t = exec_time.mutable_times()->Add();
196         t->add_int64_values(p.first);
197         t->add_int64_values(p.second);
198       }
199     }
200 
201     exec_.mutable_cpu_execs()->clear();
202     for (const auto& e : cpu_execs_) {
203       auto& exec_time = (*exec_.mutable_cpu_execs())[e.first];
204       for (const auto& p : e.second) {
205         auto* t = exec_time.mutable_times()->Add();
206         t->add_int64_values(p.first);
207         t->add_int64_values(p.second);
208       }
209     }
210 
211     exec_.mutable_devices()->Clear();
212     exec_.mutable_devices()->Reserve(devices_.size());
213     for (const string& d : devices_) {
214       exec_.add_devices(d);
215     }
216     exec_.mutable_allocations()->Clear();
217     for (const auto& r : allocations_) {
218       exec_.add_allocations()->MergeFrom(r);
219     }
220 
221     exec_.mutable_memory_execs()->Clear();
222     for (const auto& m : memory_execs_) {
223       exec_.add_memory_execs()->MergeFrom(m);
224     }
225     return exec_;
226   }
227 
FromProto(const ExecProfile & exec)228   void FromProto(const ExecProfile& exec) {
229     exec_.Clear();
230     exec_.MergeFrom(exec);
231 
232     devices_.clear();
233     devices_.insert(exec.devices().begin(), exec.devices().end());
234 
235     accelerator_execs_.clear();
236     cpu_execs_.clear();
237     op_execs_.clear();
238 
239     allocations_.clear();
240     memory_execs_.clear();
241 
242     for (const auto& exec_time : exec_.accelerator_execs()) {
243       auto& exec = accelerator_execs_[exec_time.first];
244       auto& op_exec = op_execs_[exec_time.first];
245       for (const auto& p : exec_time.second.times()) {
246         exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
247         op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
248       }
249     }
250     for (const auto& exec_time : exec_.cpu_execs()) {
251       auto& exec = cpu_execs_[exec_time.first];
252       auto& op_exec = op_execs_[exec_time.first];
253       for (const auto& p : exec_time.second.times()) {
254         exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
255         op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
256       }
257     }
258     for (const auto& r : exec_.allocations()) {
259       allocations_.push_back(r);
260     }
261     for (const auto& m : exec_.memory_execs()) {
262       memory_execs_.push_back(m);
263     }
264   }
265 
266  private:
267   ExecProfile exec_;
268   // device -> vector of {op_start_micros, op_exec_micros} pairs.
269   // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
270   // For accelerator, vector size can be larger than 1, multiple kernel fires
271   // or in tf.while_loop.
272   std::map<string, std::vector<std::pair<int64, int64>>> accelerator_execs_;
273   // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
274   // For cpu, vector size can be larger than 1 if in tf.while_loop.
275   std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
276   // combines accelerator_execs_ and cpu_execs_.
277   std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
278   // Each ExecMemory corresponds to one scheduling of the op. Normally,
279   // there are multiple schedulings in while_loop.
280   std::vector<ExecMemory> memory_execs_;
281   // All devices the op is associated with (e.g. gpu:0 (scheduling),
282   // gpu:0:stream:xx (kernel exec), cpu:0 host)
283   std::set<string> devices_;
284 
285   // The history of accelerator allocations and deallocations of this step.
286   std::vector<AllocationRecord> allocations_;
287 };
288 
// Expands to the body of a `<type>_bytes(int64 step)` style accessor. The
// expansion site must have in scope:
//   - `execs_`: a map from step to an object exposing `<type>_bytes()`;
//   - `step`:   the requested step.
// It returns from the enclosing function with:
//   - 0 when `execs_` is empty, or when step >= 0 and that step is missing;
//   - the value of that step when step >= 0;
//   - the average over all steps when step < 0.
// The element count is converted to int64 before dividing: dividing an int64
// by the unsigned size() directly would perform unsigned arithmetic and turn
// a negative total into a huge positive value.
#define GRAPH_NODE_BYTES(type)                        \
  do {                                                \
    if (execs_.empty()) {                             \
      return 0;                                       \
    }                                                 \
    if (step >= 0) {                                  \
      auto exec = execs_.find(step);                  \
      if (exec == execs_.end()) return 0;             \
      return exec->second.type##_bytes();             \
    }                                                 \
                                                      \
    int64 bytes = 0;                                  \
    for (const auto& exec : execs_) {                 \
      bytes += exec.second.type##_bytes();            \
    }                                                 \
    return bytes / static_cast<int64>(execs_.size()); \
  } while (0)
306 
307 class TFGraphNode {
308  public:
TFGraphNode(const ProfileNode & node,const ProfileProto & profile,const std::map<int64,string> * id_to_string,const std::map<string,std::unique_ptr<TFGraphNode>> * nodes_map)309   TFGraphNode(const ProfileNode& node, const ProfileProto& profile,
310               const std::map<int64, string>* id_to_string,
311               const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
312     nodes_map_ = nodes_map;
313     FromProto(node, profile, id_to_string);
314   }
315 
TFGraphNode(const NodeDef * node,int64 id,const std::map<string,std::unique_ptr<TFGraphNode>> * nodes_map)316   TFGraphNode(const NodeDef* node, int64 id,
317               const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
318     nodes_map_ = nodes_map;
319     node_.set_id(id);
320     node_.set_name(node->name());
321     node_.set_op(node->op());
322     node_.set_float_ops(0);
323 
324     for (const auto& attr : node->attr()) {
325       (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second);
326       if (attr.first == "shape" && attr.second.has_shape()) {
327         if (!shape_.empty()) {
328           absl::FPrintF(stderr, "Found duplicated shapes!\n");
329           continue;
330         }
331         shape_ = ShapeProtoToVec(attr.second.shape());
332       } else if (attr.first == "_output_shapes" && attr.second.has_list()) {
333         if (!output_shapes_.empty()) {
334           absl::FPrintF(stderr, "Found duplicated output shapes!\n");
335           continue;
336         }
337         for (int i = 0; i < attr.second.list().shape_size(); ++i) {
338           output_shapes_[i] = ShapeProtoToVec(attr.second.list().shape(i));
339         }
340       }
341     }
342     op_types_.insert(node->op());
343   }
344 
AddInput(const string & input,int64 output_index,int input_idx)345   void AddInput(const string& input, int64 output_index, int input_idx) {
346     inputs_[input_idx] = input;
347     src_output_idx_[input] = output_index;
348   }
349 
AddOpType(const string & op_type)350   void AddOpType(const string& op_type) { op_types_.insert(op_type); }
351 
352   void AddStepStat(int64 step, const string& device,
353                    const NodeExecStats& step_stat);
354 
AddFloatOps(int64 float_ops)355   void AddFloatOps(int64 float_ops) { node_.set_float_ops(float_ops); }
356 
357   // TODO(xpan): This could take a lot of memory.
AddCode(const CodeDef & code,const std::map<int64,string> * id_to_string)358   void AddCode(const CodeDef& code,
359                const std::map<int64, string>* id_to_string) {
360     if (!call_stack_) {
361       call_stack_.reset(new CallStack(code, id_to_string));
362     }
363   }
364 
name()365   const string& name() const { return node_.name(); }
id()366   int64 id() const { return node_.id(); }
op()367   const string& op() const { return node_.op(); }
node()368   const ProfileNode& node() { return node_; }
369 
trackable(int64 step)370   bool trackable(int64 step) const {
371     auto exec = execs_.find(step);
372     if (exec == execs_.end()) return false;
373 
374     if (exec->second.all_start_micros() == 0) return false;
375     if (node_.canonical_device().empty() || node_.host_device().empty()) {
376       return false;
377     }
378     return true;
379   }
380 
ToProto(const std::map<string,std::unique_ptr<TFGraphNode>> & nodes_map)381   const ProfileNode& ToProto(
382       const std::map<string, std::unique_ptr<TFGraphNode>>& nodes_map) {
383     node_.clear_shape();
384     node_.mutable_shape()->Reserve(shape().size());
385     for (int64 s : shape()) {
386       node_.add_shape(s);
387     }
388 
389     node_.clear_op_types();
390     node_.mutable_op_types()->Reserve(op_types().size());
391     for (const string& t : op_types()) {
392       node_.add_op_types(t);
393     }
394 
395     node_.clear_execs();
396     for (auto& exec : execs_) {
397       auto& exec_pb = (*node_.mutable_execs())[exec.first];
398       exec_pb.MergeFrom(exec.second.ToProto());
399     }
400 
401     node_.clear_inputs();
402     for (const auto& inp : inputs_) {
403       (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id();
404     }
405 
406     node_.clear_input_shapes();
407     for (const auto& s : input_shapes_) {
408       auto& shape = (*node_.mutable_input_shapes())[s.first];
409       for (int64 d : s.second) {
410         shape.add_int64_values(d);
411       }
412     }
413 
414     node_.clear_output_shapes();
415     for (const auto& s : output_shapes_) {
416       auto& shape = (*node_.mutable_output_shapes())[s.first];
417       for (int64 d : s.second) {
418         shape.add_int64_values(d);
419       }
420     }
421 
422     node_.clear_src_output_index();
423     for (const auto& s : src_output_idx_) {
424       int64 id = nodes_map.at(s.first)->id();
425       (*node_.mutable_src_output_index())[id] = s.second;
426     }
427 
428     if (call_stack_) {
429       node_.clear_trace();
430       node_.mutable_trace()->MergeFrom(call_stack_->code_def());
431     }
432     return node_;
433   }
434 
FromProto(const ProfileNode & node,const ProfileProto & profile,const std::map<int64,string> * id_to_string)435   void FromProto(const ProfileNode& node, const ProfileProto& profile,
436                  const std::map<int64, string>* id_to_string) {
437     node_.Clear();
438     node_.MergeFrom(node);
439 
440     call_stack_.reset(new CallStack(node.trace(), id_to_string));
441 
442     op_types_.clear();
443     op_types_.insert(node_.op_types().begin(), node_.op_types().end());
444 
445     shape_.clear();
446     for (int64 s : node_.shape()) {
447       shape_.push_back(s);
448     }
449 
450     execs_.clear();
451     for (const auto& exec_pb : node.execs()) {
452       auto& exec = execs_[exec_pb.first];
453       exec.FromProto(exec_pb.second);
454     }
455 
456     inputs_.clear();
457     for (const auto& inp : node.inputs()) {
458       inputs_[inp.first] = profile.nodes().at(inp.second).name();
459     }
460 
461     input_shapes_.clear();
462     for (const auto& s : node.input_shapes()) {
463       auto& shape = input_shapes_[s.first];
464       for (const int64 d : s.second.int64_values()) {
465         shape.push_back(d);
466       }
467     }
468 
469     output_shapes_.clear();
470     for (const auto& s : node.output_shapes()) {
471       auto& shape = output_shapes_[s.first];
472       for (const int64 d : s.second.int64_values()) {
473         shape.push_back(d);
474       }
475     }
476 
477     src_output_idx_.clear();
478     for (const auto& s : node.src_output_index()) {
479       src_output_idx_[profile.nodes().at(s.first).name()] = s.second;
480     }
481   }
482 
inputs()483   const std::map<int32, string>& inputs() const { return inputs_; }
484 
485   // Number of times the graph node is executed. When step < 0, the
486   // average number of times executed across all steps.
run_count(int64 step)487   int64 run_count(int64 step) const {
488     if (execs_.empty()) {
489       return 0;
490     }
491     if (step >= 0) {
492       auto exec = execs_.find(step);
493       if (exec == execs_.end()) {
494         return 0;
495       }
496       return exec->second.run_count();
497     }
498     int64 total_run_count = 0;
499     for (const auto& exec : execs_) {
500       total_run_count += exec.second.run_count();
501     }
502     return total_run_count / execs_.size();
503   }
504   // This is overall computation time, including both cpu and accelerator.
505   // Note, cpu and accelerator might or might not run in parallel.
exec_micros(int64 step)506   int64 exec_micros(int64 step) const {
507     // Empty when no RunMetadata is provided.
508     if (execs_.empty()) {
509       return 0;
510     }
511     if (step >= 0) {
512       auto exec = execs_.find(step);
513       if (exec == execs_.end()) {
514         return 0;
515       }
516       return exec->second.exec_micros();
517     }
518 
519     int64 total_micros = 0;
520     for (const auto& exec : execs_) {
521       total_micros += exec.second.exec_micros();
522     }
523     return total_micros / execs_.size();
524   }
525 
526   // This is accelerator computation time of a step, or average of
527   // multiple step, when step < 0.
accelerator_exec_micros(int64 step)528   int64 accelerator_exec_micros(int64 step) const {
529     // Empty when no RunMetadata is provided.
530     if (execs_.empty()) {
531       return 0;
532     }
533     if (step >= 0) {
534       auto exec = execs_.find(step);
535       if (exec == execs_.end()) {
536         return 0;
537       }
538       return exec->second.accelerator_exec_micros();
539     }
540 
541     int64 total_micros = 0;
542     for (const auto& exec : execs_) {
543       total_micros += exec.second.accelerator_exec_micros();
544     }
545     return total_micros / execs_.size();
546   }
547 
548   // This is cpu computation time of a step, or average of
549   // multiple step, when step < 0.
cpu_exec_micros(int64 step)550   int64 cpu_exec_micros(int64 step) const {
551     // Empty when no RunMetadata is provided.
552     if (execs_.empty()) {
553       return 0;
554     }
555     if (step >= 0) {
556       auto exec = execs_.find(step);
557       if (exec == execs_.end()) {
558         return 0;
559       }
560       return exec->second.cpu_exec_micros();
561     }
562 
563     int64 total_micros = 0;
564     for (const auto& exec : execs_) {
565       total_micros += exec.second.cpu_exec_micros();
566     }
567     return total_micros / execs_.size();
568   }
569 
requested_bytes(int64 step)570   int64 requested_bytes(int64 step) const { GRAPH_NODE_BYTES(requested); }
peak_bytes(int64 step)571   int64 peak_bytes(int64 step) const { GRAPH_NODE_BYTES(peak); }
residual_bytes(int64 step)572   int64 residual_bytes(int64 step) const { GRAPH_NODE_BYTES(residual); }
output_bytes(int64 step)573   int64 output_bytes(int64 step) const { GRAPH_NODE_BYTES(output); }
574 
all_start_micros(int64 step)575   int64 all_start_micros(int64 step) const {
576     auto exec = execs_.find(step);
577     if (exec == execs_.end()) {
578       return 0;
579     }
580     return exec->second.all_start_micros();
581   }
582 
latest_end_micros(int64 step)583   int64 latest_end_micros(int64 step) const {
584     auto exec = execs_.find(step);
585     if (exec == execs_.end()) {
586       return 0;
587     }
588     return exec->second.latest_end_micros();
589   }
590 
lastest_schedule_end_micros(int64 step)591   int64 lastest_schedule_end_micros(int64 step) const {
592     auto exec = execs_.find(step);
593     if (exec == execs_.end()) {
594       return 0;
595     }
596     return exec->second.lastest_schedule_end_micros();
597   }
598 
op_execs(int64 step)599   const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs(
600       int64 step) const {
601     auto exec = execs_.find(step);
602     if (exec == execs_.end()) {
603       return empty_execs_;
604     }
605     return exec->second.op_execs();
606   }
cpu_execs(int64 step)607   const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs(
608       int64 step) const {
609     auto exec = execs_.find(step);
610     if (exec == execs_.end()) {
611       return empty_execs_;
612     }
613     return exec->second.cpu_execs();
614   }
615 
all_op_execs()616   const std::map<int64, ExecStep>& all_op_execs() const { return execs_; }
617 
accelerator_temp_bytes(int64 step)618   int64 accelerator_temp_bytes(int64 step) const {
619     auto exec = execs_.find(step);
620     if (exec == execs_.end()) {
621       return 0;
622     }
623     return exec->second.accelerator_temp_bytes();
624   }
host_temp_bytes(int64 step)625   int64 host_temp_bytes(int64 step) const {
626     auto exec = execs_.find(step);
627     if (exec == execs_.end()) {
628       return 0;
629     }
630     return exec->second.host_temp_bytes();
631   }
accelerator_persistent_bytes()632   int64 accelerator_persistent_bytes() const {
633     int64 persistent_bytes = 0;
634     for (const auto& exec : execs_) {
635       persistent_bytes = std::max(persistent_bytes,
636                                   exec.second.accelerator_persistent_bytes());
637     }
638     return persistent_bytes;
639   }
allocator_bytes_in_use(int64 step)640   const std::map<int64, int64> allocator_bytes_in_use(int64 step) const {
641     auto exec = execs_.find(step);
642     if (exec == execs_.end()) {
643       return empty_bytes_in_use_;
644     }
645     return exec->second.allocator_bytes_in_use();
646   }
647 
allocations(int64 step)648   const std::vector<AllocationRecord>& allocations(int64 step) const {
649     auto exec = execs_.find(step);
650     if (exec == execs_.end()) {
651       return empty_allocations_;
652     }
653     return exec->second.allocations();
654   }
655 
parameters()656   int64 parameters() const {
657     if (!shape().empty()) {
658       int64 params = 1;
659       bool complete_shape = true;
660       for (int64 d : shape()) {
661         // Sometimes parameters could be <0 when a dim is unknown.
662         if (d < 0) {
663           complete_shape = false;
664           break;
665         }
666         params *= d;
667       }
668       if (complete_shape) {
669         return params;
670       } else {
671         absl::FPrintF(stderr, "Incomplete shape.\n");
672       }
673     }
674     return 0;
675   }
676 
float_ops(int64 step)677   int64 float_ops(int64 step) const {
678     // If not run, return static analysis.
679     if (execs_.empty()) {
680       return node_.float_ops();
681     }
682     // Otherwise, return dynamic float_ops.
683     return node_.float_ops() * run_count(step);
684   }
call_stack()685   const CallStack* call_stack() { return call_stack_.get(); }
canonical_device()686   string canonical_device() const { return node_.canonical_device(); }
host_device()687   string host_device() const { return node_.host_device(); }
op_types()688   const std::set<string>& op_types() const { return op_types_; }
689 
op_attrs(const string & name)690   const AttrValue* op_attrs(const string& name) const {
691     const auto it = node_.attrs().find(name);
692     if (it == node_.attrs().end()) {
693       return nullptr;
694     }
695     return &it->second;
696   }
697 
shape()698   const std::vector<int64>& shape() const { return shape_; }
699 
output_shapes()700   const std::map<int, std::vector<int64>>& output_shapes() const {
701     return output_shapes_;
702   }
703 
input_shapes()704   const std::map<int, std::vector<int64>> input_shapes() const {
705     std::map<int, std::vector<int64>> input_shapes;
706     for (const auto& inp : inputs_) {
707       // Always create an empty vec even if the shape info might be missing.
708       std::vector<int64>& shape_vec = input_shapes[inp.first];
709       if (!nodes_map_) continue;
710       auto input_it = nodes_map_->find(inp.second);
711       if (input_it == nodes_map_->end()) continue;
712       auto output_it = src_output_idx_.find(inp.second);
713       if (output_it == src_output_idx_.end()) continue;
714 
715       const TFGraphNode* input_node = input_it->second.get();
716       if (!input_node) continue;
717       const auto& output_shapes = input_node->output_shapes();
718       const auto& output_shape = output_shapes.find(output_it->second);
719       if (output_shape == output_shapes.end()) continue;
720 
721       if (output_shape != input_node->output_shapes().end()) {
722         shape_vec.assign(output_shape->second.begin(),
723                          output_shape->second.end());
724       }
725     }
726     return input_shapes;
727   }
728 
729  private:
730   // maps graph node name to TFGraphNode. Not owned.
731   const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_;
732   // inputs to the node. input index -> input node name.
733   std::map<int, string> inputs_;
734   // The output index of the source node.
735   std::map<string, int32> src_output_idx_;
736   // proto for serialize/deserialized representation of the node.
737   ProfileNode node_;
738   // Python call stack that creates the name.
739   std::unique_ptr<CallStack> call_stack_;
740   // Shape of the node (e.g. Variable) if available.
741   std::vector<int64> shape_;
742   // Won't missing input_idx. But some shapes might be empty (unknown).
743   std::map<int, std::vector<int64>> input_shapes_;
744   // Could miss output_idx if no _output_shapes attr. some shapes can also
745   // be empty.
746   std::map<int, std::vector<int64>> output_shapes_;
747 
748   std::set<string> op_types_;
749 
750   std::map<int64, ExecStep> execs_;
751 
752   // Placeholder for empty cases.
753   std::map<int64, int64> empty_bytes_in_use_;
754   std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
755   std::vector<AllocationRecord> empty_allocations_;
756 };
757 
758 class TFMultiGraphNode {
759  public:
TFMultiGraphNode(const string & name)760   TFMultiGraphNode(const string& name)
761       : name_(name),
762         step_(-1),
763         run_count_(0),
764         exec_micros_(0),
765         accelerator_exec_micros_(0),
766         cpu_exec_micros_(0),
767         requested_bytes_(0),
768         peak_bytes_(0),
769         residual_bytes_(0),
770         output_bytes_(0),
771         float_ops_(0),
772         parameters_(0) {}
773 
SnapshotNodes(int64 step,const std::vector<string> & type_regexes)774   bool SnapshotNodes(int64 step, const std::vector<string>& type_regexes) {
775     run_count_ = 0;
776     exec_micros_ = 0;
777     accelerator_exec_micros_ = 0;
778     cpu_exec_micros_ = 0;
779 
780     requested_bytes_ = 0;
781     peak_bytes_ = 0;
782     residual_bytes_ = 0;
783     output_bytes_ = 0;
784 
785     float_ops_ = 0;
786     parameters_ = 0;
787     op_types_.clear();
788     shapes_.clear();
789     devices_.clear();
790     snapshot_nodes_.clear();
791 
792     step_ = step;
793     std::vector<const TFGraphNode*> nodes = pick_nodes(type_regexes);
794 
795     if (nodes.empty()) {
796       return (type_regexes.size() == 1 && type_regexes[0] == ".*");
797     }
798 
799     for (const TFGraphNode* node : nodes) {
800       op_types_.insert(node->op_types().begin(), node->op_types().end());
801 
802       run_count_ += node->run_count(step);
803       exec_micros_ += node->exec_micros(step);
804       accelerator_exec_micros_ += node->accelerator_exec_micros(step);
805       cpu_exec_micros_ += node->cpu_exec_micros(step);
806 
807       requested_bytes_ += node->requested_bytes(step);
808       peak_bytes_ += node->peak_bytes(step);
809       residual_bytes_ += node->residual_bytes(step);
810       output_bytes_ += node->output_bytes(step);
811 
812       float_ops_ += node->float_ops(step);
813       parameters_ += node->parameters();
814       if (node->shape().size() > 0) {
815         shapes_.push_back(node->shape());
816       }
817       devices_.insert(node->canonical_device());
818       snapshot_nodes_[node->name()] = node;
819     }
820     return true;
821   }
822 
step()823   int64 step() const { return step_; }
824 
AddGraphNode(const TFGraphNode * node)825   void AddGraphNode(const TFGraphNode* node) {
826     if (nodes_.find(node->name()) != nodes_.end()) {
827       return;
828     }
829     nodes_[node->name()] = node;
830   }
831 
graph_nodes()832   const std::map<string, const TFGraphNode*>& graph_nodes() const {
833     return snapshot_nodes_;
834   }
835 
name()836   const string& name() const { return name_; }
837 
run_count()838   int64 run_count() const { return run_count_; }
exec_micros()839   int64 exec_micros() const { return exec_micros_; }
accelerator_exec_micros()840   int64 accelerator_exec_micros() const { return accelerator_exec_micros_; }
cpu_exec_micros()841   int64 cpu_exec_micros() const { return cpu_exec_micros_; }
842 
requested_bytes()843   int64 requested_bytes() const { return requested_bytes_; }
peak_bytes()844   int64 peak_bytes() const { return peak_bytes_; }
residual_bytes()845   int64 residual_bytes() const { return residual_bytes_; }
output_bytes()846   int64 output_bytes() const { return output_bytes_; }
847 
float_ops()848   int64 float_ops() const { return float_ops_; }
849 
parameters()850   int64 parameters() const { return parameters_; }
851 
devices()852   const std::set<string>& devices() const { return devices_; }
853 
op_types()854   const std::set<string>& op_types() const { return op_types_; }
855 
shapes()856   const std::vector<std::vector<int64>>& shapes() const { return shapes_; }
857 
858  private:
pick_nodes(const std::vector<string> & type_regexes)859   std::vector<const TFGraphNode*> pick_nodes(
860       const std::vector<string>& type_regexes) {
861     if (type_regexes.empty()) {
862       return {};
863     }
864     std::vector<const TFGraphNode*> ret;
865     if (type_regexes.size() == 1 && type_regexes[0] == ".*") {
866       for (const auto& n : nodes_) {
867         ret.push_back(n.second);
868       }
869       return ret;
870     }
871 
872     for (const string& regex : type_regexes) {
873       for (const auto& n : nodes_) {
874         for (const string& type : n.second->op_types()) {
875           if (RE2::FullMatch(type, regex)) {
876             ret.push_back(n.second);
877             break;
878           }
879         }
880       }
881     }
882     return ret;
883   }
884 
885   const string name_;
886   int64 step_;
887   // Snapshot based on type_regexes
888   std::set<string> op_types_;
889   int64 run_count_;
890   int64 exec_micros_;
891   int64 accelerator_exec_micros_;
892   int64 cpu_exec_micros_;
893 
894   int64 requested_bytes_;
895   int64 peak_bytes_;
896   int64 residual_bytes_;
897   int64 output_bytes_;
898   int64 float_ops_;
899   int64 parameters_;
900   std::set<string> devices_;
901   std::vector<std::vector<int64>> shapes_;
902   std::map<string, const TFGraphNode*> snapshot_nodes_;
903 
904   // Overall data held by the TFMultiGraphNode.
905   std::map<string, const TFGraphNode*> nodes_;
906 };
907 
908 bool IsPlacedOnCPU(const string& device);
909 bool IsPlacedOnAccelerator(const string& device);
910 bool CountAsAcceleratorTime(const string& device);
911 bool CountAsCPUTime(const string& device);
912 bool IsCanonicalDevice(const string& device);
913 
914 }  // namespace tfprof
915 }  // namespace tensorflow
916 
917 #endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
918