1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h"
17
18 #include <string>
19
20 #include "google/protobuf/any.pb.h"
21 #include "absl/strings/str_cat.h"
22 #include "tensorflow/core/platform/types.h"
23 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
24 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
25 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
26 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
27 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
28 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
29 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
30 #include "tensorflow/core/profiler/protobuf/overview_page.pb.h"
31 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
32 #include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
33 #include "tensorflow/core/profiler/utils/diagnostics.h"
34 #include "tensorflow/core/profiler/utils/format_utils.h"
35 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
36 #include "tensorflow/core/profiler/utils/html_utils.h"
37 #include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
38 #include "tensorflow/core/profiler/utils/math_utils.h"
39 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
40 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
41 #include "tensorflow/core/profiler/utils/time_utils.h"
42
43 namespace tensorflow {
44 namespace profiler {
45
46 namespace {
47
// Percentage threshold for 16-bit device compute. When the measured share of
// 16-bit compute is strictly below this value, GeneratePrecisionStatement()
// emits a suggestion to use more 16-bit ops.
constexpr double kLowPrecisionPercentThreshold = 10.0;
51
52 struct TfFunctionInfo {
53 absl::string_view function_name;
54 double expensive_call_percent;
55 };
56
MakeOverviewPageTip(std::string text)57 OverviewPageTip MakeOverviewPageTip(std::string text) {
58 OverviewPageTip tip;
59 tip.set_link(std::move(text));
60 return tip;
61 }
62
63 // Makes a recommendation for looking up a document.
64 // doc_url is expected to be already be escaped suitably for use in an HTML
65 // attribute.
MakeOverviewPageTipDocLink(absl::string_view doc_url,absl::string_view text)66 OverviewPageTip MakeOverviewPageTipDocLink(absl::string_view doc_url,
67 absl::string_view text) {
68 return MakeOverviewPageTip(AnchorElement(doc_url, text));
69 }
70
ComputeHostTips(OverviewPageRecommendation * re)71 void ComputeHostTips(OverviewPageRecommendation* re) {
72 *re->add_host_tips() = MakeOverviewPageTip(
73 "input_pipeline_analyzer (especially Section 3 for the breakdown of "
74 "input operations on the Host)");
75 *re->add_host_tips() = MakeOverviewPageTip(
76 "tf_data_bottleneck_analysis (find the bottleneck in the tf.data input "
77 "pipeline)");
78 *re->add_host_tips() = MakeOverviewPageTip(
79 "trace_viewer (look at the activities on the timeline of each Host "
80 "Thread near the bottom of the trace view)");
81 }
82
ComputeDeviceTips(HardwareType hardware_type,OverviewPageRecommendation * re)83 void ComputeDeviceTips(HardwareType hardware_type,
84 OverviewPageRecommendation* re) {
85 absl::string_view device_name = HardwareType_Name(hardware_type);
86 absl::string_view timeline_name = device_name;
87 absl::string_view op_stats_toolname = "tensorflow_stats";
88 if (hardware_type == tensorflow::profiler::TPU) {
89 timeline_name = "TPU core";
90 op_stats_toolname = "op_profile";
91 }
92 *re->add_device_tips() = MakeOverviewPageTip(
93 absl::StrCat(op_stats_toolname,
94 " (identify the time-consuming operations "
95 "executed on the ",
96 device_name, ")"));
97 *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat(
98 "trace_viewer (look at the activities on the timeline of each ",
99 timeline_name, " in the trace view)"));
100 }
101
ComputeFaqTips(OverviewPageRecommendation * re)102 void ComputeFaqTips(OverviewPageRecommendation* re) {
103 *re->add_faq_tips() = MakeOverviewPageTip("Refer to the TF2 Profiler FAQ");
104 }
105
ComputeDocumentationTips(OverviewPageRecommendation * re)106 void ComputeDocumentationTips(OverviewPageRecommendation* re) {
107 *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
108 "https://www.tensorflow.org/guide/data_performance_analysis",
109 "Analyze tf.data performance with the TF Profiler");
110 *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
111 "https://www.tensorflow.org/guide/"
112 "data_performance",
113 "Better performance with the tf.data API");
114 }
115
GeneratePrecisionStatement(const PrecisionStats & precision_stats)116 std::string GeneratePrecisionStatement(const PrecisionStats& precision_stats) {
117 uint64 total_compute_ps =
118 precision_stats.compute_16bit_ps() + precision_stats.compute_32bit_ps();
119 if (total_compute_ps > 0) {
120 double percent_16bit =
121 (100.0 * precision_stats.compute_16bit_ps()) / total_compute_ps;
122 if (percent_16bit < kLowPrecisionPercentThreshold) {
123 return absl::StrCat(
124 "Only ", OneDigit(percent_16bit),
125 "% of device computation is 16 bit. So you might want to replace "
126 "more 32-bit Ops by 16-bit Ops to improve performance (if the "
127 "reduced accuracy is acceptable).");
128 }
129 }
130 return "";
131 }
132
133 } // namespace
134
SetCommonRecommendation(absl::string_view input_classification,absl::string_view input_statement,absl::string_view output_statement,HardwareType hardware_type,absl::string_view tf_function_statement_html,absl::string_view eager_statement_html,absl::string_view outside_compilation_statement_html,OverviewPageRecommendation * re)135 void SetCommonRecommendation(
136 absl::string_view input_classification, absl::string_view input_statement,
137 absl::string_view output_statement, HardwareType hardware_type,
138 absl::string_view tf_function_statement_html,
139 absl::string_view eager_statement_html,
140 absl::string_view outside_compilation_statement_html,
141 OverviewPageRecommendation* re) {
142 re->set_bottleneck(std::string(input_classification));
143 re->set_statement(std::string(input_statement));
144 re->set_output_statement(std::string(output_statement));
145 re->set_tf_function_statement_html(std::string(tf_function_statement_html));
146 re->set_eager_statement_html(std::string(eager_statement_html));
147 re->set_outside_compilation_statement_html(
148 std::string(outside_compilation_statement_html));
149 ComputeHostTips(re);
150 ComputeDeviceTips(hardware_type, re);
151 ComputeDocumentationTips(re);
152 ComputeFaqTips(re);
153 }
154
ComputeGenericRecommendation(const BottleneckAnalysis & bottleneck,const PrecisionStats & precision_stats)155 OverviewPageRecommendation ComputeGenericRecommendation(
156 const BottleneckAnalysis& bottleneck,
157 const PrecisionStats& precision_stats) {
158 OverviewPageRecommendation re;
159 GenericRecommendation generic;
160 generic.set_device_collectives_bottleneck(
161 bottleneck.device_collectives_classification());
162 generic.set_device_collectives_statement(
163 bottleneck.device_collectives_statement());
164 generic.set_kernel_launch_bottleneck(
165 bottleneck.kernel_launch_classification());
166 generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
167 generic.set_all_other_bottleneck(bottleneck.all_other_classification());
168 generic.set_all_other_statement(bottleneck.all_other_statement());
169 generic.set_precision_statement(GeneratePrecisionStatement(precision_stats));
170 re.mutable_recommendation()->PackFrom(generic);
171 return re;
172 }
173
ComputeAnalysisResult(const OpStats & op_stats)174 OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
175 OverviewPageAnalysis analysis;
176 OpMetricsDb device_tf_op_metrics_db = CreateTfMetricsDbFromDeviceOpMetricsDb(
177 op_stats.device_op_metrics_db(), /*with_idle=*/false);
178 KernelStatsByOpName kernel_stats_by_op_name =
179 GroupKernelReportsByOpName(op_stats.kernel_stats_db());
180 uint64 total_device_time_ps = device_tf_op_metrics_db.total_time_ps();
181 constexpr int kNumTopOpsShown = 10;
182 double device_cumulative_fraction = 0.0;
183 for (const OpMetrics* metrics :
184 SortedOpMetricsDb(device_tf_op_metrics_db, kNumTopOpsShown)) {
185 OverviewTfOp* op = analysis.add_top_device_ops();
186 op->set_name(metrics->name());
187 op->set_category(metrics->category());
188 op->set_self_time_fraction(
189 SafeDivide(metrics->self_time_ps(), total_device_time_ps));
190 device_cumulative_fraction += op->self_time_fraction();
191 op->set_cumulative_time_fraction(device_cumulative_fraction);
192 op->set_flop_rate(
193 SafeDivide(metrics->flops(), PicosToNanos(metrics->time_ps())));
194 auto iter = kernel_stats_by_op_name.find(op->name());
195 if (iter != kernel_stats_by_op_name.end()) {
196 op->set_is_op_tensorcore_eligible(
197 iter->second.is_op_tensor_core_eligible);
198 op->set_is_op_using_tensorcore(iter->second.tensor_core_duration_ns != 0);
199 }
200 }
201 uint64 total_device_compute_ps =
202 op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps() +
203 op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps();
204 analysis.set_device_compute_16bit_percent(
205 100.0 *
206 SafeDivide(
207 op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps(),
208 total_device_compute_ps));
209 analysis.set_device_compute_32bit_percent(
210 100.0 *
211 SafeDivide(
212 op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps(),
213 total_device_compute_ps));
214
215 uint64 num_host_tf_ops = 0;
216 uint64 total_host_op_time_ps_exclude_idle = 0;
217 uint64 eager_host_op_time_ps = 0;
218 for (const OpMetrics& metrics : op_stats.host_op_metrics_db().metrics_db()) {
219 num_host_tf_ops += metrics.occurrences();
220 if (!IsIdleOp(metrics)) {
221 total_host_op_time_ps_exclude_idle += metrics.self_time_ps();
222 if (metrics.is_eager()) eager_host_op_time_ps += metrics.self_time_ps();
223 }
224 }
225 uint64 num_device_tf_ops = 0;
226 uint64 total_device_op_time_ps_exclude_idle = 0;
227 uint64 eager_device_op_time_ps = 0;
228 for (const OpMetrics& metrics : device_tf_op_metrics_db.metrics_db()) {
229 num_device_tf_ops += metrics.occurrences();
230 if (!IsIdleOp(metrics)) {
231 total_device_op_time_ps_exclude_idle += metrics.self_time_ps();
232 if (metrics.is_eager()) eager_device_op_time_ps += metrics.self_time_ps();
233 }
234 }
235 // Figures out outside_compilation time from
236 // op_stats.device_op_metrics_db().metrics_db(). We don't use the
237 // {metrics.provenance(), metrics.name()} from
238 // device_tf_op_metrics_db.metrics_db(), because metrics.provenance() there is
239 // not set and metrics.name() can be either HLO-Op name or TF-Op name, which
240 // will confuse IsOutsideCompilationOp().
241 uint64 outside_compilation_device_op_time_ps = 0;
242 for (const OpMetrics& metrics :
243 op_stats.device_op_metrics_db().metrics_db()) {
244 if (!IsOutsideCompilationOp(metrics.provenance(), metrics.long_name()))
245 continue;
246 outside_compilation_device_op_time_ps += metrics.self_time_ps();
247 }
248 uint64 num_total_tf_ops = num_host_tf_ops + num_device_tf_ops;
249 analysis.set_host_tf_op_percent(
250 100.0 * SafeDivide(num_host_tf_ops, num_total_tf_ops));
251 analysis.set_device_tf_op_percent(
252 100.0 * SafeDivide(num_device_tf_ops, num_total_tf_ops));
253 analysis.set_host_trace_level(op_stats.run_environment().host_trace_level());
254 analysis.set_host_op_time_eager_percent(
255 100.0 *
256 SafeDivide(eager_host_op_time_ps, total_host_op_time_ps_exclude_idle));
257 analysis.set_device_op_time_eager_percent(
258 100.0 * SafeDivide(eager_device_op_time_ps,
259 total_device_op_time_ps_exclude_idle));
260 analysis.set_device_op_time_outside_compilation_percent(
261 100.0 * SafeDivide(outside_compilation_device_op_time_ps,
262 total_device_op_time_ps_exclude_idle));
263 return analysis;
264 }
265
266 // Converts from HostIndependentJobInfo to OverviewPageHostIndependentJobInfo.
ToOverviewPageHostIndependentJobInfo(const HostIndependentJobInfoResult & host_independent_job_info)267 OverviewPageHostIndependentJobInfo ToOverviewPageHostIndependentJobInfo(
268 const HostIndependentJobInfoResult& host_independent_job_info) {
269 OverviewPageHostIndependentJobInfo result;
270 result.set_change_list(host_independent_job_info.change_list());
271 result.set_build_time(host_independent_job_info.build_time());
272 result.set_build_target(host_independent_job_info.build_target());
273 result.set_profile_duration_ms(
274 host_independent_job_info.profile_duration_ms());
275 return result;
276 }
277
278 // Converts from HostDependentJobInfo to OverviewPageHostDependentJobInfo.
ToOverviewPageHostDependentJobInfo(const HostDependentJobInfoResult & host_dependent_job_info)279 OverviewPageHostDependentJobInfo ToOverviewPageHostDependentJobInfo(
280 const HostDependentJobInfoResult& host_dependent_job_info) {
281 OverviewPageHostDependentJobInfo result;
282 result.set_host_id(host_dependent_job_info.host_id());
283 result.set_command_line(host_dependent_job_info.command_line());
284 result.set_start_time(host_dependent_job_info.start_time());
285 result.set_bns_address(host_dependent_job_info.bns_address());
286 result.set_profile_time_ns(host_dependent_job_info.profile_time_ns());
287 return result;
288 }
289
ComputeRunEnvironment(const RunEnvironment & run_environment)290 OverviewPageRunEnvironment ComputeRunEnvironment(
291 const RunEnvironment& run_environment) {
292 OverviewPageRunEnvironment re;
293 re.set_host_count(run_environment.host_count());
294 re.set_task_count(run_environment.task_count());
295 re.set_device_type(run_environment.device_type());
296 re.set_device_core_count(run_environment.device_core_count());
297 re.set_per_core_batch_size(run_environment.per_core_batch_size());
298 re.set_replica_count(run_environment.replica_count());
299 re.set_num_cores_per_replica(run_environment.num_cores_per_replica());
300 *re.mutable_host_independent_job_info() =
301 ToOverviewPageHostIndependentJobInfo(
302 run_environment.host_independent_job_info());
303 for (const auto& host_dependent_job_info :
304 run_environment.host_dependent_job_info()) {
305 *re.add_host_dependent_job_info() =
306 ToOverviewPageHostDependentJobInfo(host_dependent_job_info);
307 }
308 return re;
309 }
310
TfFunctionRecommendationHtml(const TfFunctionDb & tf_function_db)311 std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db) {
312 std::vector<TfFunctionInfo> candidates;
313 for (const auto& name_fun : tf_function_db.tf_functions()) {
314 const auto& fun = name_fun.second;
315 if (fun.expensive_call_percent() >= kTfFunctionReportThresholdInPercent) {
316 candidates.push_back({name_fun.first, fun.expensive_call_percent()});
317 }
318 }
319 if (candidates.empty()) return "";
320 auto cmp = [](const TfFunctionInfo& a, const TfFunctionInfo& b) {
321 return a.expensive_call_percent > b.expensive_call_percent;
322 };
323 // Sorts candidates in descending order of expensive_call_percent.
324 absl::c_sort(candidates, cmp);
325 std::string expensive_functions = "";
326 auto num_functions_shown = std::min(
327 static_cast<decltype(candidates)::size_type>(3), candidates.size());
328
329 for (decltype(candidates)::size_type i = 0; i < num_functions_shown; i++) {
330 if (i > 0) absl::StrAppend(&expensive_functions, ", ");
331 absl::StrAppend(&expensive_functions, "\"", candidates[i].function_name,
332 "\"");
333 }
334 if (candidates.size() > num_functions_shown)
335 absl::StrAppend(&expensive_functions, " and more");
336 return absl::StrCat("Expensive tf-functions detected (", expensive_functions,
337 ") due to either retracing or eager execution.");
338 }
339
EagerRecommendationHtml(double host_op_time_eager_percent,double device_op_time_eager_percent)340 std::string EagerRecommendationHtml(double host_op_time_eager_percent,
341 double device_op_time_eager_percent) {
342 std::string recommendation = "";
343 if (host_op_time_eager_percent > kEagerReportThresholdInPercent)
344 absl::StrAppend(&recommendation, OneDigit(host_op_time_eager_percent),
345 "% of Op time on the host used eager execution. ");
346 if (device_op_time_eager_percent > kEagerReportThresholdInPercent)
347 absl::StrAppend(&recommendation, OneDigit(device_op_time_eager_percent),
348 "% of Op time on the device used eager execution. ");
349 if (!recommendation.empty())
350 absl::StrAppend(&recommendation, "Performance could be improved with ",
351 AnchorElement("https://www.tensorflow.org/guide/function",
352 "tf.function."));
353 return recommendation;
354 }
355
OutsideCompilationRecommendationHtml(double device_op_time_outside_compilation_percent)356 std::string OutsideCompilationRecommendationHtml(
357 double device_op_time_outside_compilation_percent) {
358 if (device_op_time_outside_compilation_percent <=
359 kOutsideCompilationThresholdInPercent)
360 return "";
361 return absl::StrCat(
362 OneDigit(device_op_time_outside_compilation_percent),
363 " % of Op time on the device are for outside compilation. Performance "
364 "could be improved by avoiding outside compilation.");
365 }
366
ConvertOpStatsToOverviewPage(const OpStats & op_stats)367 OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats) {
368 OverviewPage overview_page;
369 *overview_page.mutable_run_environment() =
370 ComputeRunEnvironment(op_stats.run_environment());
371 *overview_page.mutable_analysis() = ComputeAnalysisResult(op_stats);
372 *overview_page.mutable_input_analysis() =
373 ConvertOpStatsToInputPipelineAnalysis(op_stats);
374 BottleneckAnalysis bottleneck = ComputeBottleneckAnalysis(
375 overview_page.input_analysis().input_time_breakdown(),
376 overview_page.input_analysis().step_details());
377 *overview_page.mutable_recommendation() = ComputeGenericRecommendation(
378 bottleneck, op_stats.device_op_metrics_db().precision_stats());
379 SetCommonRecommendation(
380 bottleneck.input_classification(), bottleneck.input_statement(), "",
381 ParseHardwareType(op_stats.run_environment().device_type()),
382 TfFunctionRecommendationHtml(op_stats.tf_function_db()),
383 EagerRecommendationHtml(
384 overview_page.analysis().host_op_time_eager_percent(),
385 overview_page.analysis().device_op_time_eager_percent()),
386 OutsideCompilationRecommendationHtml(
387 overview_page.analysis()
388 .device_op_time_outside_compilation_percent()),
389 overview_page.mutable_recommendation());
390 PopulateOverviewDiagnostics(op_stats, overview_page.mutable_diagnostics());
391 return overview_page;
392 }
393
394 } // namespace profiler
395 } // namespace tensorflow
396