1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h"
17 
18 #include <string>
19 
20 #include "google/protobuf/any.pb.h"
21 #include "absl/strings/str_cat.h"
22 #include "tensorflow/core/platform/types.h"
23 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
24 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
25 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
26 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
27 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
28 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
29 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
30 #include "tensorflow/core/profiler/protobuf/overview_page.pb.h"
31 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
32 #include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
33 #include "tensorflow/core/profiler/utils/diagnostics.h"
34 #include "tensorflow/core/profiler/utils/format_utils.h"
35 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
36 #include "tensorflow/core/profiler/utils/html_utils.h"
37 #include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
38 #include "tensorflow/core/profiler/utils/math_utils.h"
39 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
40 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
41 #include "tensorflow/core/profiler/utils/time_utils.h"
42 
43 namespace tensorflow {
44 namespace profiler {
45 
46 namespace {
47 
// If 16-bit ops account for less than this percentage of the total (16-bit +
// 32-bit) device compute time, a statement suggesting the use of more
// low-precision ops is generated.
50 constexpr double kLowPrecisionPercentThreshold = 10;
51 
// A tf-function that qualifies for the "expensive tf-functions" statement.
// Initialized positionally, so the member order must not change.
struct TfFunctionInfo {
  // Name of the tf-function. This is a non-owning view; the string it refers
  // to must outlive this struct.
  absl::string_view function_name;
  // The expensive_call_percent() value reported for this function.
  double expensive_call_percent;
};
56 
MakeOverviewPageTip(std::string text)57 OverviewPageTip MakeOverviewPageTip(std::string text) {
58   OverviewPageTip tip;
59   tip.set_link(std::move(text));
60   return tip;
61 }
62 
63 // Makes a recommendation for looking up a document.
64 // doc_url is expected to be already be escaped suitably for use in an HTML
65 // attribute.
MakeOverviewPageTipDocLink(absl::string_view doc_url,absl::string_view text)66 OverviewPageTip MakeOverviewPageTipDocLink(absl::string_view doc_url,
67                                            absl::string_view text) {
68   return MakeOverviewPageTip(AnchorElement(doc_url, text));
69 }
70 
ComputeHostTips(OverviewPageRecommendation * re)71 void ComputeHostTips(OverviewPageRecommendation* re) {
72   *re->add_host_tips() = MakeOverviewPageTip(
73       "input_pipeline_analyzer (especially Section 3 for the breakdown of "
74       "input operations on the Host)");
75   *re->add_host_tips() = MakeOverviewPageTip(
76       "tf_data_bottleneck_analysis (find the bottleneck in the tf.data input "
77       "pipeline)");
78   *re->add_host_tips() = MakeOverviewPageTip(
79       "trace_viewer (look at the activities on the timeline of each Host "
80       "Thread near the bottom of the trace view)");
81 }
82 
ComputeDeviceTips(HardwareType hardware_type,OverviewPageRecommendation * re)83 void ComputeDeviceTips(HardwareType hardware_type,
84                        OverviewPageRecommendation* re) {
85   absl::string_view device_name = HardwareType_Name(hardware_type);
86   absl::string_view timeline_name = device_name;
87   absl::string_view op_stats_toolname = "tensorflow_stats";
88   if (hardware_type == tensorflow::profiler::TPU) {
89     timeline_name = "TPU core";
90     op_stats_toolname = "op_profile";
91   }
92   *re->add_device_tips() = MakeOverviewPageTip(
93       absl::StrCat(op_stats_toolname,
94                    " (identify the time-consuming operations "
95                    "executed on the ",
96                    device_name, ")"));
97   *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat(
98       "trace_viewer (look at the activities on the timeline of each ",
99       timeline_name, " in the trace view)"));
100 }
101 
ComputeFaqTips(OverviewPageRecommendation * re)102 void ComputeFaqTips(OverviewPageRecommendation* re) {
103   *re->add_faq_tips() = MakeOverviewPageTip("Refer to the TF2 Profiler FAQ");
104 }
105 
ComputeDocumentationTips(OverviewPageRecommendation * re)106 void ComputeDocumentationTips(OverviewPageRecommendation* re) {
107   *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
108       "https://www.tensorflow.org/guide/data_performance_analysis",
109       "Analyze tf.data performance with the TF Profiler");
110   *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
111       "https://www.tensorflow.org/guide/"
112       "data_performance",
113       "Better performance with the tf.data API");
114 }
115 
GeneratePrecisionStatement(const PrecisionStats & precision_stats)116 std::string GeneratePrecisionStatement(const PrecisionStats& precision_stats) {
117   uint64 total_compute_ps =
118       precision_stats.compute_16bit_ps() + precision_stats.compute_32bit_ps();
119   if (total_compute_ps > 0) {
120     double percent_16bit =
121         (100.0 * precision_stats.compute_16bit_ps()) / total_compute_ps;
122     if (percent_16bit < kLowPrecisionPercentThreshold) {
123       return absl::StrCat(
124           "Only ", OneDigit(percent_16bit),
125           "% of device computation is 16 bit. So you might want to replace "
126           "more 32-bit Ops by 16-bit Ops to improve performance (if the "
127           "reduced accuracy is acceptable).");
128     }
129   }
130   return "";
131 }
132 
133 }  // namespace
134 
SetCommonRecommendation(absl::string_view input_classification,absl::string_view input_statement,absl::string_view output_statement,HardwareType hardware_type,absl::string_view tf_function_statement_html,absl::string_view eager_statement_html,absl::string_view outside_compilation_statement_html,OverviewPageRecommendation * re)135 void SetCommonRecommendation(
136     absl::string_view input_classification, absl::string_view input_statement,
137     absl::string_view output_statement, HardwareType hardware_type,
138     absl::string_view tf_function_statement_html,
139     absl::string_view eager_statement_html,
140     absl::string_view outside_compilation_statement_html,
141     OverviewPageRecommendation* re) {
142   re->set_bottleneck(std::string(input_classification));
143   re->set_statement(std::string(input_statement));
144   re->set_output_statement(std::string(output_statement));
145   re->set_tf_function_statement_html(std::string(tf_function_statement_html));
146   re->set_eager_statement_html(std::string(eager_statement_html));
147   re->set_outside_compilation_statement_html(
148       std::string(outside_compilation_statement_html));
149   ComputeHostTips(re);
150   ComputeDeviceTips(hardware_type, re);
151   ComputeDocumentationTips(re);
152   ComputeFaqTips(re);
153 }
154 
ComputeGenericRecommendation(const BottleneckAnalysis & bottleneck,const PrecisionStats & precision_stats)155 OverviewPageRecommendation ComputeGenericRecommendation(
156     const BottleneckAnalysis& bottleneck,
157     const PrecisionStats& precision_stats) {
158   OverviewPageRecommendation re;
159   GenericRecommendation generic;
160   generic.set_device_collectives_bottleneck(
161       bottleneck.device_collectives_classification());
162   generic.set_device_collectives_statement(
163       bottleneck.device_collectives_statement());
164   generic.set_kernel_launch_bottleneck(
165       bottleneck.kernel_launch_classification());
166   generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
167   generic.set_all_other_bottleneck(bottleneck.all_other_classification());
168   generic.set_all_other_statement(bottleneck.all_other_statement());
169   generic.set_precision_statement(GeneratePrecisionStatement(precision_stats));
170   re.mutable_recommendation()->PackFrom(generic);
171   return re;
172 }
173 
ComputeAnalysisResult(const OpStats & op_stats)174 OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
175   OverviewPageAnalysis analysis;
176   OpMetricsDb device_tf_op_metrics_db = CreateTfMetricsDbFromDeviceOpMetricsDb(
177       op_stats.device_op_metrics_db(), /*with_idle=*/false);
178   KernelStatsByOpName kernel_stats_by_op_name =
179       GroupKernelReportsByOpName(op_stats.kernel_stats_db());
180   uint64 total_device_time_ps = device_tf_op_metrics_db.total_time_ps();
181   constexpr int kNumTopOpsShown = 10;
182   double device_cumulative_fraction = 0.0;
183   for (const OpMetrics* metrics :
184        SortedOpMetricsDb(device_tf_op_metrics_db, kNumTopOpsShown)) {
185     OverviewTfOp* op = analysis.add_top_device_ops();
186     op->set_name(metrics->name());
187     op->set_category(metrics->category());
188     op->set_self_time_fraction(
189         SafeDivide(metrics->self_time_ps(), total_device_time_ps));
190     device_cumulative_fraction += op->self_time_fraction();
191     op->set_cumulative_time_fraction(device_cumulative_fraction);
192     op->set_flop_rate(
193         SafeDivide(metrics->flops(), PicosToNanos(metrics->time_ps())));
194     auto iter = kernel_stats_by_op_name.find(op->name());
195     if (iter != kernel_stats_by_op_name.end()) {
196       op->set_is_op_tensorcore_eligible(
197           iter->second.is_op_tensor_core_eligible);
198       op->set_is_op_using_tensorcore(iter->second.tensor_core_duration_ns != 0);
199     }
200   }
201   uint64 total_device_compute_ps =
202       op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps() +
203       op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps();
204   analysis.set_device_compute_16bit_percent(
205       100.0 *
206       SafeDivide(
207           op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps(),
208           total_device_compute_ps));
209   analysis.set_device_compute_32bit_percent(
210       100.0 *
211       SafeDivide(
212           op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps(),
213           total_device_compute_ps));
214 
215   uint64 num_host_tf_ops = 0;
216   uint64 total_host_op_time_ps_exclude_idle = 0;
217   uint64 eager_host_op_time_ps = 0;
218   for (const OpMetrics& metrics : op_stats.host_op_metrics_db().metrics_db()) {
219     num_host_tf_ops += metrics.occurrences();
220     if (!IsIdleOp(metrics)) {
221       total_host_op_time_ps_exclude_idle += metrics.self_time_ps();
222       if (metrics.is_eager()) eager_host_op_time_ps += metrics.self_time_ps();
223     }
224   }
225   uint64 num_device_tf_ops = 0;
226   uint64 total_device_op_time_ps_exclude_idle = 0;
227   uint64 eager_device_op_time_ps = 0;
228   for (const OpMetrics& metrics : device_tf_op_metrics_db.metrics_db()) {
229     num_device_tf_ops += metrics.occurrences();
230     if (!IsIdleOp(metrics)) {
231       total_device_op_time_ps_exclude_idle += metrics.self_time_ps();
232       if (metrics.is_eager()) eager_device_op_time_ps += metrics.self_time_ps();
233     }
234   }
235   // Figures out outside_compilation time from
236   // op_stats.device_op_metrics_db().metrics_db(). We don't use the
237   // {metrics.provenance(), metrics.name()} from
238   // device_tf_op_metrics_db.metrics_db(), because metrics.provenance() there is
239   // not set and metrics.name() can be either HLO-Op name or TF-Op name, which
240   // will confuse IsOutsideCompilationOp().
241   uint64 outside_compilation_device_op_time_ps = 0;
242   for (const OpMetrics& metrics :
243        op_stats.device_op_metrics_db().metrics_db()) {
244     if (!IsOutsideCompilationOp(metrics.provenance(), metrics.long_name()))
245       continue;
246     outside_compilation_device_op_time_ps += metrics.self_time_ps();
247   }
248   uint64 num_total_tf_ops = num_host_tf_ops + num_device_tf_ops;
249   analysis.set_host_tf_op_percent(
250       100.0 * SafeDivide(num_host_tf_ops, num_total_tf_ops));
251   analysis.set_device_tf_op_percent(
252       100.0 * SafeDivide(num_device_tf_ops, num_total_tf_ops));
253   analysis.set_host_trace_level(op_stats.run_environment().host_trace_level());
254   analysis.set_host_op_time_eager_percent(
255       100.0 *
256       SafeDivide(eager_host_op_time_ps, total_host_op_time_ps_exclude_idle));
257   analysis.set_device_op_time_eager_percent(
258       100.0 * SafeDivide(eager_device_op_time_ps,
259                          total_device_op_time_ps_exclude_idle));
260   analysis.set_device_op_time_outside_compilation_percent(
261       100.0 * SafeDivide(outside_compilation_device_op_time_ps,
262                          total_device_op_time_ps_exclude_idle));
263   return analysis;
264 }
265 
266 // Converts from HostIndependentJobInfo to OverviewPageHostIndependentJobInfo.
ToOverviewPageHostIndependentJobInfo(const HostIndependentJobInfoResult & host_independent_job_info)267 OverviewPageHostIndependentJobInfo ToOverviewPageHostIndependentJobInfo(
268     const HostIndependentJobInfoResult& host_independent_job_info) {
269   OverviewPageHostIndependentJobInfo result;
270   result.set_change_list(host_independent_job_info.change_list());
271   result.set_build_time(host_independent_job_info.build_time());
272   result.set_build_target(host_independent_job_info.build_target());
273   result.set_profile_duration_ms(
274       host_independent_job_info.profile_duration_ms());
275   return result;
276 }
277 
278 // Converts from HostDependentJobInfo to OverviewPageHostDependentJobInfo.
ToOverviewPageHostDependentJobInfo(const HostDependentJobInfoResult & host_dependent_job_info)279 OverviewPageHostDependentJobInfo ToOverviewPageHostDependentJobInfo(
280     const HostDependentJobInfoResult& host_dependent_job_info) {
281   OverviewPageHostDependentJobInfo result;
282   result.set_host_id(host_dependent_job_info.host_id());
283   result.set_command_line(host_dependent_job_info.command_line());
284   result.set_start_time(host_dependent_job_info.start_time());
285   result.set_bns_address(host_dependent_job_info.bns_address());
286   result.set_profile_time_ns(host_dependent_job_info.profile_time_ns());
287   return result;
288 }
289 
ComputeRunEnvironment(const RunEnvironment & run_environment)290 OverviewPageRunEnvironment ComputeRunEnvironment(
291     const RunEnvironment& run_environment) {
292   OverviewPageRunEnvironment re;
293   re.set_host_count(run_environment.host_count());
294   re.set_task_count(run_environment.task_count());
295   re.set_device_type(run_environment.device_type());
296   re.set_device_core_count(run_environment.device_core_count());
297   re.set_per_core_batch_size(run_environment.per_core_batch_size());
298   re.set_replica_count(run_environment.replica_count());
299   re.set_num_cores_per_replica(run_environment.num_cores_per_replica());
300   *re.mutable_host_independent_job_info() =
301       ToOverviewPageHostIndependentJobInfo(
302           run_environment.host_independent_job_info());
303   for (const auto& host_dependent_job_info :
304        run_environment.host_dependent_job_info()) {
305     *re.add_host_dependent_job_info() =
306         ToOverviewPageHostDependentJobInfo(host_dependent_job_info);
307   }
308   return re;
309 }
310 
TfFunctionRecommendationHtml(const TfFunctionDb & tf_function_db)311 std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db) {
312   std::vector<TfFunctionInfo> candidates;
313   for (const auto& name_fun : tf_function_db.tf_functions()) {
314     const auto& fun = name_fun.second;
315     if (fun.expensive_call_percent() >= kTfFunctionReportThresholdInPercent) {
316       candidates.push_back({name_fun.first, fun.expensive_call_percent()});
317     }
318   }
319   if (candidates.empty()) return "";
320   auto cmp = [](const TfFunctionInfo& a, const TfFunctionInfo& b) {
321     return a.expensive_call_percent > b.expensive_call_percent;
322   };
323   // Sorts candidates in descending order of expensive_call_percent.
324   absl::c_sort(candidates, cmp);
325   std::string expensive_functions = "";
326   auto num_functions_shown = std::min(
327       static_cast<decltype(candidates)::size_type>(3), candidates.size());
328 
329   for (decltype(candidates)::size_type i = 0; i < num_functions_shown; i++) {
330     if (i > 0) absl::StrAppend(&expensive_functions, ", ");
331     absl::StrAppend(&expensive_functions, "\"", candidates[i].function_name,
332                     "\"");
333   }
334   if (candidates.size() > num_functions_shown)
335     absl::StrAppend(&expensive_functions, " and more");
336   return absl::StrCat("Expensive tf-functions detected (", expensive_functions,
337                       ") due to either retracing or eager execution.");
338 }
339 
EagerRecommendationHtml(double host_op_time_eager_percent,double device_op_time_eager_percent)340 std::string EagerRecommendationHtml(double host_op_time_eager_percent,
341                                     double device_op_time_eager_percent) {
342   std::string recommendation = "";
343   if (host_op_time_eager_percent > kEagerReportThresholdInPercent)
344     absl::StrAppend(&recommendation, OneDigit(host_op_time_eager_percent),
345                     "% of Op time on the host used eager execution. ");
346   if (device_op_time_eager_percent > kEagerReportThresholdInPercent)
347     absl::StrAppend(&recommendation, OneDigit(device_op_time_eager_percent),
348                     "% of Op time on the device used eager execution. ");
349   if (!recommendation.empty())
350     absl::StrAppend(&recommendation, "Performance could be improved with ",
351                     AnchorElement("https://www.tensorflow.org/guide/function",
352                                   "tf.function."));
353   return recommendation;
354 }
355 
OutsideCompilationRecommendationHtml(double device_op_time_outside_compilation_percent)356 std::string OutsideCompilationRecommendationHtml(
357     double device_op_time_outside_compilation_percent) {
358   if (device_op_time_outside_compilation_percent <=
359       kOutsideCompilationThresholdInPercent)
360     return "";
361   return absl::StrCat(
362       OneDigit(device_op_time_outside_compilation_percent),
363       " % of Op time on the device are for outside compilation. Performance "
364       "could be improved by avoiding outside compilation.");
365 }
366 
ConvertOpStatsToOverviewPage(const OpStats & op_stats)367 OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats) {
368   OverviewPage overview_page;
369   *overview_page.mutable_run_environment() =
370       ComputeRunEnvironment(op_stats.run_environment());
371   *overview_page.mutable_analysis() = ComputeAnalysisResult(op_stats);
372   *overview_page.mutable_input_analysis() =
373       ConvertOpStatsToInputPipelineAnalysis(op_stats);
374   BottleneckAnalysis bottleneck = ComputeBottleneckAnalysis(
375       overview_page.input_analysis().input_time_breakdown(),
376       overview_page.input_analysis().step_details());
377   *overview_page.mutable_recommendation() = ComputeGenericRecommendation(
378       bottleneck, op_stats.device_op_metrics_db().precision_stats());
379   SetCommonRecommendation(
380       bottleneck.input_classification(), bottleneck.input_statement(), "",
381       ParseHardwareType(op_stats.run_environment().device_type()),
382       TfFunctionRecommendationHtml(op_stats.tf_function_db()),
383       EagerRecommendationHtml(
384           overview_page.analysis().host_op_time_eager_percent(),
385           overview_page.analysis().device_op_time_eager_percent()),
386       OutsideCompilationRecommendationHtml(
387           overview_page.analysis()
388               .device_op_time_outside_compilation_percent()),
389       overview_page.mutable_recommendation());
390   PopulateOverviewDiagnostics(op_stats, overview_page.mutable_diagnostics());
391   return overview_page;
392 }
393 
394 }  // namespace profiler
395 }  // namespace tensorflow
396