1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/util/stat_summarizer.h"
17 
18 #include <iomanip>
19 #include <map>
20 #include <queue>
21 #include <sstream>
22 #include <string>
23 
24 #include "tensorflow/core/framework/step_stats.pb.h"
25 #include "tensorflow/core/framework/tensor_description.pb.h"
26 #include "tensorflow/core/framework/tensor_shape.pb.h"
27 #include "tensorflow/core/lib/strings/str_util.h"
28 #include "tensorflow/core/platform/env.h"
29 #include "tensorflow/core/platform/logging.h"
30 #include "tensorflow/core/platform/types.h"
31 
32 namespace tensorflow {
33 
34 using Detail = StatsCalculator::Detail;
35 
StatSummarizer(const StatSummarizerOptions & options)36 StatSummarizer::StatSummarizer(const StatSummarizerOptions& options)
37     : stats_calculator_(new StatsCalculator(options)) {}
38 
StatSummarizer(const tensorflow::GraphDef & tensorflow_graph)39 StatSummarizer::StatSummarizer(const tensorflow::GraphDef& tensorflow_graph)
40     : stats_calculator_(new StatsCalculator(StatSummarizerOptions())) {}
41 
~StatSummarizer()42 StatSummarizer::~StatSummarizer() {}
43 
Validate(const std::vector<TensorDescription> * outputs,const NodeExecStats & ns) const44 void StatSummarizer::Validate(const std::vector<TensorDescription>* outputs,
45                               const NodeExecStats& ns) const {
46   if (outputs->size() != ns.output_size()) {
47     LOG(WARNING) << "Number of outputs changed between runs for '"
48                  << ns.node_name() << "' - was " << outputs->size() << ", now "
49                  << ns.output_size();
50   } else {
51     for (const auto& output : ns.output()) {
52       const int32 slot = output.slot();
53       if ((slot < 0) || (slot >= ns.output_size())) {
54         // This is not a hard error for Switch ops, so just pass.
55         continue;
56       }
57       const auto& stored = (*outputs)[slot];
58       const auto& current = output.tensor_description();
59 
60       bool do_tensors_match =
61           (stored.dtype() == current.dtype()) &&
62           (stored.shape().dim_size() == current.shape().dim_size());
63 
64       if (do_tensors_match) {
65         for (int i = 0; i < stored.shape().dim_size(); ++i) {
66           if (stored.shape().dim(i).size() != current.shape().dim(i).size()) {
67             do_tensors_match = false;
68             break;
69           }
70         }
71       }
72 
73       if (!do_tensors_match) {
74         LOG(WARNING) << "Output tensor changed between runs for '"
75                      << ns.node_name();
76       }
77     }
78   }
79 }
80 
PrintStepStats() const81 void StatSummarizer::PrintStepStats() const {
82   string output = GetOutputString();
83   std::istringstream iss(output);
84   for (std::string line; std::getline(iss, line);) {
85     LOG(INFO) << line;
86   }
87 }
88 
89 namespace {
OpType(const DeviceStepStats & ds,const NodeExecStats & ns)90 std::string OpType(const DeviceStepStats& ds, const NodeExecStats& ns) {
91   // There is no published specification of how DeviceStats and NodeStats
92   // are filled in. Thus, we live with the fragility of this implementation.
93   //
94   // Note that NodeStats.node_name may NOT refer to a node in the Graph.
95   // This can happen if, either:
96   // (1) The DeviceStats corresponds to statistics from the GPUTracer
97   //     logging (which adds devices whose name contains either "/stream"
98   //     or "/memcpy" to the StepStats), OR
99   // (2) The graph was partitioned, and thus the NodeStats refers to
100   //     the SendTensor or RecvTensor operations added.
101   // For these cases, return "<>" as the "type" of the operation.
102   //
103   // The StatSummarizer was initially aimed at CPU execution on mobile, where
104   // there was no GPUTracing and no graph partitioning, so the conditions above
105   // do not occur.
106   //
107   // It would be nice to have a clearer spec for StepStats so utilities such as
108   // this class can handle nodes that do not appear in the original graph
109   // gracefully. Till then, duplicate what is done by:
110   // https://www.tensorflow.org/code/tensorflow/python/client/timeline.py
111   // and rely on the unittest.
112   if (ds.device().find("/stream") != std::string::npos ||
113       ds.device().find("/memcpy") != std::string::npos) {
114     // Stats from the GPUTracer, does not correspond to TensorFlow ops.
115     return "<>";
116   }
117   // timeline_label should be of the format: <node_name> = <op_type>(<args>)
118   // Extract <op_type>.
119   const std::string sep(" = ");
120   const std::string& label = ns.timeline_label();
121   std::string::size_type start = label.find(sep);
122   if (start == std::string::npos) return "<>";
123   start += sep.size();
124   std::string::size_type end = label.find("(", start);
125   if (end == std::string::npos) return "<>";
126   return label.substr(start, end - start);
127 }
128 }  // namespace
129 
ProcessStepStats(const StepStats & step_stats)130 void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
131   int64 curr_total_us = 0;
132   int64 mem_total = 0;
133 
134   int64 first_node_start_us =
135       step_stats.dev_stats(0).node_stats(0).all_start_micros();
136 
137   int node_num = 0;
138   for (const auto& ds : step_stats.dev_stats()) {
139     for (const auto& ns : ds.node_stats()) {
140       // NOTE(blackhc): To better support GPUs:
141       // GPU kernels are duplicated both in /stream:all and their
142       // /stream:$index. GPU memcpys are duplicated both in /memcpy and their
143       // /stream:$index. So only keep /stream:all and /memcpy and ignore all
144       // /stream:$index to only count GPU executions once.
145       if (ds.device().find("/stream") != std::string::npos &&
146           ds.device().find("/stream:all") == std::string::npos) {
147         continue;
148       }
149 
150       std::string name = ns.node_name();
151       std::string op_type = "<>";
152       // NOTE(blackhc): we have to ensure that all keys into the detail map
153       // are unique, so we add [Kernel] or [MemCpy] as a suffix to the name.
154       // To make the node type summary work better, we prefix "gpu:" to
155       // the op type when the info is from a /gpu/stream or /memcpy channel.
156       if (ds.device().find("/stream") != std::string::npos) {
157         // node_name: name ":" opType
158         auto parts = str_util::Split(ns.node_name(), ':');
159         if (parts.size() == 2) {
160           name = parts[0] + " [Kernel]";
161           op_type = "gpu:" + parts[1];
162         }
163       } else if (ds.device().find("/memcpy") != std::string::npos) {
164         // node_name: name (":" opType)? ":" memCpyType
165         auto parts = str_util::Split(ns.node_name(), ':');
166         if (parts.size() == 2 || parts.size() == 3) {
167           name = parts.front() + " [MemCpy]";
168           // We don't care about the actual op type (it might not be available
169           // for edge_ memcpys). We only care that it's a memcpy for now.
170           op_type = "gpu:" + parts.back();
171         }
172       } else {
173         op_type = OpType(ds, ns);
174       }
175 
176       ++node_num;
177       const int64 curr_time = ns.all_end_rel_micros();
178       curr_total_us += curr_time;
179       auto output_result =
180           outputs_.emplace(name, std::vector<TensorDescription>());
181       std::vector<TensorDescription>* outputs = &(output_result.first->second);
182 
183       int64_t start_us = (ns.all_start_micros() - first_node_start_us);
184       int64_t rel_end_us = curr_time;
185 
186       // If this is the first pass, initialize some values.
187       if (output_result.second) {
188         outputs->resize(ns.output_size());
189         for (const auto& output : ns.output()) {
190           const int32 slot = output.slot();
191           if ((slot < 0) || (slot >= ns.output_size())) {
192             // This is not a hard error for Switch ops, so just pass.
193             continue;
194           }
195           (*outputs)[slot] = output.tensor_description();
196         }
197       }
198 
199       int64 curr_node_mem = 0;
200       for (const auto& mem : ns.memory()) {
201         const int64 mem_usage = mem.total_bytes();
202         curr_node_mem += mem_usage;
203       }
204       stats_calculator_->AddNodeStats(name, op_type, node_num, start_us,
205                                       rel_end_us, curr_node_mem);
206 
207       mem_total += curr_node_mem;
208 
209       Validate(outputs, ns);
210     }
211   }
212 
213   stats_calculator_->UpdateRunTotalUs(curr_total_us);
214   stats_calculator_->UpdateMemoryUsed(mem_total);
215 }
216 
217 
PrintOutputs() const218 void StatSummarizer::PrintOutputs() const {
219   std::priority_queue<
220       std::pair<int64, const std::pair<const std::string, Detail>*>>
221       timings;
222   for (const auto& entry : stats_calculator_->GetDetails()) {
223     timings.emplace(-entry.second.start_us.avg(), &entry);
224   }
225 
226   LOG(INFO) << "============ Node output tensor sizes in run order ========";
227   while (!timings.empty()) {
228     auto entry = timings.top();
229     timings.pop();
230     std::stringstream stream;
231     const auto detail_outputs = outputs_.at(entry.second->first);
232     stream << entry.second->first << "\t" << detail_outputs.size();
233     for (const auto& tensor : detail_outputs) {
234       stream << "\t" << DataTypeString(tensor.dtype());
235       stream << "\t" << tensor.shape().dim_size();
236       for (const auto& d : tensor.shape().dim()) {
237         stream << "\t" << d.size();
238       }
239     }
240     LOG(INFO) << stream.str();
241   }
242 }
243 
244 }  // namespace tensorflow
245