1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
17 
18 #include <math.h>
19 
20 #include <algorithm>
21 #include <vector>
22 
23 #include "google/protobuf/any.pb.h"
24 #include "absl/container/flat_hash_map.h"
25 #include "absl/strings/match.h"
26 #include "absl/strings/str_cat.h"
27 #include "absl/strings/string_view.h"
28 #include "tensorflow/core/lib/gtl/map_util.h"
29 #include "tensorflow/core/platform/logging.h"
30 #include "tensorflow/core/platform/types.h"
31 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
32 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
33 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
34 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
35 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
36 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
37 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
38 #include "tensorflow/core/profiler/utils/diagnostics.h"
39 #include "tensorflow/core/profiler/utils/event_span.h"
40 #include "tensorflow/core/profiler/utils/format_utils.h"
41 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
42 #include "tensorflow/core/profiler/utils/html_utils.h"
43 #include "tensorflow/core/profiler/utils/math_utils.h"
44 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
45 #include "tensorflow/core/profiler/utils/time_utils.h"
46 #include "tensorflow/core/util/stats_calculator.h"
47 
48 namespace tensorflow {
49 namespace profiler {
50 
51 namespace {
52 
// Number of picoseconds in one millisecond (used to convert step durations
// from ps to ms).
const double kNumPsPerMs = 1000000000.0;

// If the percentage of step time that is due to infeed is less than
// kModeratelyInfeedBoundThresholdInPercent, it is considered NOT
// input-bound; else if it is less than
// kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY
// input-bound; else it is considered HIGHLY input-bound.
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to outfeed is less than
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
// output-bound; else if it is less than
// kHighlyOutfeedBoundThresholdInPercent, it is considered MODERATELY
// output-bound; else it is considered HIGHLY output-bound.
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to kernel launch is less than
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
// kernel-launch bound; else if it is less than
// kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY
// kernel-launch bound; else it is considered HIGHLY kernel-launch bound.
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;

// If the percentage of step time that is due to all other time is less than
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
// all-other bound; else if it is less than
// kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY
// all-other bound; else it is considered HIGHLY all-other bound.
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;

// If the percentage of step time that is due to device collectives is less
// than kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered
// NOT device-collectives bound; else if it is less than
// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
// device-collectives bound; else it is considered HIGHLY device-collectives
// bound.
constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;

// Section number of the host-analysis section in the input-pipeline analysis.
constexpr int kHostAnalysisSectionNumber = 3;
// Python-only explanation for "All Others" time.
const char* kAllOthersPythonExplanation =
    " % of the total step time sampled is spent on 'All Others' time. "
    "This could be due to Python execution overhead.";
// Explanation for "Kernel Launch" time due to CPU contention with tf.data.
const char* kKernelLaunchTfDataContention =
    " It could be due to CPU contention with tf.data. In this case, you may "
    "try to set the environment variable TF_GPU_THREAD_MODE=gpu_private.";
106 
107 template <class Collection>
GetTimeInMs(const Collection & type_ps,EventType event_type)108 double GetTimeInMs(const Collection& type_ps, EventType event_type) {
109   return PicosToMillis(gtl::FindWithDefault(type_ps, event_type, /*value=*/0));
110 }
111 
GetStepSummaryForSampleStats(const Stat<double> & sample_stats)112 StepSummary GetStepSummaryForSampleStats(const Stat<double>& sample_stats) {
113   StepSummary step_time_summary;
114   double avg, sdv, min, max;
115   if (sample_stats.empty()) {
116     // If sample_stats is empty, sample_stats.avg() will return NaN. However, we
117     // prefer to show an 0 instead.
118     avg = sdv = min = max = 0.0;
119   } else {
120     avg = sample_stats.avg();
121     sdv = sqrt(sample_stats.sample_variance());
122     min = sample_stats.min();
123     max = sample_stats.max();
124   }
125   step_time_summary.set_average(avg);
126   step_time_summary.set_standard_deviation(sdv);
127   step_time_summary.set_minimum(min);
128   step_time_summary.set_maximum(max);
129   return step_time_summary;
130 }
131 
// Computes per-component summaries (avg/sdev/min/max, in ms) of the generic
// step-time breakdown across all steps recorded in `analysis.step_details()`.
// Returns an empty breakdown if any step's details fail to unpack.
GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
    const InputPipelineAnalysisResult& analysis) {
  // One sample accumulator per step-time component; each step contributes one
  // sample to each accumulator.
  Stat<double> unknown_time_ms;
  Stat<double> host_wait_input_ms;
  Stat<double> host_to_device_ms;
  Stat<double> input_ms;
  Stat<double> output_ms;
  Stat<double> device_compute_ms;
  Stat<double> device_to_device_ms;
  Stat<double> device_collectives_ms;
  Stat<double> host_compute_ms;
  Stat<double> host_prepare_ms;
  Stat<double> host_compile_ms;
  GenericStepTimeBreakdown result;

  for (const google::protobuf::Any& step_details : analysis.step_details()) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    if (!success && !step_details.type_url().empty()) {
      // A non-empty type_url that fails to unpack means the step details are
      // of an unexpected type; bail out with an empty breakdown.
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    unknown_time_ms.UpdateStat(details.unknown_time_ms());
    host_wait_input_ms.UpdateStat(details.host_wait_input_ms());
    host_to_device_ms.UpdateStat(details.host_to_device_ms());
    // Total input time is host wait plus host-to-device transfer.
    input_ms.UpdateStat(details.host_wait_input_ms() +
                        details.host_to_device_ms());
    output_ms.UpdateStat(details.output_ms());
    device_compute_ms.UpdateStat(details.device_compute_ms());
    device_to_device_ms.UpdateStat(details.device_to_device_ms());
    device_collectives_ms.UpdateStat(details.device_collectives_ms());
    host_compute_ms.UpdateStat(details.host_compute_ms());
    host_prepare_ms.UpdateStat(details.host_prepare_ms());
    host_compile_ms.UpdateStat(details.host_compile_ms());
  }
  // Converts each accumulator into a StepSummary on the result proto.
  *result.mutable_unknown_time_ms_summary() =
      GetStepSummaryForSampleStats(unknown_time_ms);
  *result.mutable_host_wait_input_ms_summary() =
      GetStepSummaryForSampleStats(host_wait_input_ms);
  *result.mutable_host_to_device_ms_summary() =
      GetStepSummaryForSampleStats(host_to_device_ms);
  *result.mutable_input_ms_summary() = GetStepSummaryForSampleStats(input_ms);
  *result.mutable_output_ms_summary() = GetStepSummaryForSampleStats(output_ms);
  *result.mutable_device_compute_ms_summary() =
      GetStepSummaryForSampleStats(device_compute_ms);
  *result.mutable_device_to_device_ms_summary() =
      GetStepSummaryForSampleStats(device_to_device_ms);
  *result.mutable_device_collectives_ms_summary() =
      GetStepSummaryForSampleStats(device_collectives_ms);
  *result.mutable_host_compute_ms_summary() =
      GetStepSummaryForSampleStats(host_compute_ms);
  *result.mutable_host_prepare_ms_summary() =
      GetStepSummaryForSampleStats(host_prepare_ms);
  *result.mutable_host_compile_ms_summary() =
      GetStepSummaryForSampleStats(host_compile_ms);
  return result;
}
190 
// Builds the generic (non-TPU-specific) input-pipeline analysis from the
// per-step, per-core step database: the overall step-time summary, per-step
// time breakdowns, the summary of input time as a percentage of step time,
// and the aggregated step-time breakdown.
InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
    const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
  InputPipelineAnalysisResult result;

  // Computes the summary of step time in ms.
  *result.mutable_step_time_summary() =
      ComputeStepTimeSummaryInMs(grouped_by_step);

  Stat<double> input_summary_stats_in_percent;
  for (const auto& coreid_stepinfo_map : grouped_by_step) {
    // Iterates over each step.
    const auto* ptr = gtl::FindOrNull(coreid_stepinfo_map.step_info_per_core(),
                                      kDefaultGpuLocalCoreId);
    if (ptr == nullptr) {
      // For generic hardware, all step-info is put under core-0. If ptr
      // is nullptr, it means there is no step at all.
      continue;
    }
    const StepInfoResult& step_info = *ptr;
    // Adds the details for a new step.
    PerGenericStepDetails details;
    details.set_step_number(step_info.step_num());
    details.set_step_time_ms(PicosToMillis(step_info.duration_ps()));
    GenericStepBreakdown generic;
    bool success = step_info.step_breakdown().UnpackTo(&generic);
    if (!success && !step_info.step_breakdown().type_url().empty()) {
      // The step breakdown is of an unexpected type; return an empty result.
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    // Converts each event-type duration (ps) into a millisecond field of the
    // per-step details. Some fields aggregate more than one event type.
    const auto& type_ps = generic.type_ps();
    details.set_unknown_time_ms(GetTimeInMs(type_ps, UNKNOWN_TIME));
    details.set_host_wait_input_ms(GetTimeInMs(type_ps, HOST_WAIT_INPUT));
    details.set_host_to_device_ms(GetTimeInMs(type_ps, HOST_TO_DEVICE) +
                                  GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
    details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
    details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
    details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                    GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
    details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
    details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
    details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
    result.add_step_details()->PackFrom(details);

    // Input time for this step is host wait plus host-to-device transfer,
    // expressed as a percentage of the step time.
    const double input_percent_of_step_time =
        100.0 *
        SafeDivide(details.host_wait_input_ms() + details.host_to_device_ms(),
                   details.step_time_ms());
    input_summary_stats_in_percent.UpdateStat(input_percent_of_step_time);
  }

  // Computes the summary of input time as percentage of step time.
  *result.mutable_input_percent_summary() =
      GetStepSummaryForSampleStats(input_summary_stats_in_percent);

  // Computes the breakdown of step time.
  GenericStepTimeBreakdown generic_step_time_breakdown =
      ComputeGenericStepTimeBreakdownInMs(result);
  result.mutable_step_time_breakdown()->PackFrom(generic_step_time_breakdown);

  return result;
}
255 
// Classification of input processing on the host.
enum class InputOpCategory {
  kEnqueue,           // enqueue data to be transferred to device.
  kDemandedFileRead,  // demanded (on-demand) read from file.
  kAdvancedFileRead,  // advanced read from file (including cached,
                      // prefetch, parallel-map, interleave).
  kPreprocessing      // data preprocessing.
};
264 
InputOpCategoryString(InputOpCategory category)265 std::string InputOpCategoryString(InputOpCategory category) {
266   switch (category) {
267     case InputOpCategory::kEnqueue:
268       return "Enqueue";
269     case InputOpCategory::kDemandedFileRead:
270       return "Demanded file read";
271     case InputOpCategory::kAdvancedFileRead:
272       return "Advanced file read";
273     case InputOpCategory::kPreprocessing:
274       return "Preprocessing";
275   }
276 }
277 
IsInputOp(absl::string_view category)278 inline bool IsInputOp(absl::string_view category) {
279   // Do not include "IteratorGetNext*" here, because IteratorGetNext is an Op
280   // that experiences the install stall, not an Op that causes the input stall.
281   return IsInfeedEnqueueOp(category) || IsDatasetOp(category) ||
282          IsMemcpyHToDOp(category);
283 }
284 
285 // TODO(ckluk):
286 //   Confirm with the tf.data team if the classification below is correct.
CategorizeInputOp(absl::string_view name,absl::string_view category)287 InputOpCategory CategorizeInputOp(absl::string_view name,
288                                   absl::string_view category) {
289   if (IsInfeedEnqueueOp(category) || IsMemcpyHToDOp(category)) {
290     // Ops for sending input from host to device.
291     return InputOpCategory::kEnqueue;
292   }
293   DCHECK(IsDatasetOp(category));
294   if (absl::EndsWith(name, "::TFRecord") ||
295       absl::EndsWith(name, "::TextLine") ||
296       absl::EndsWith(name, "::FixedLengthRecord") ||
297       absl::EndsWith(name, "::SSTable") || absl::EndsWith(name, "::RecordIO")) {
298     // Ops that read files.
299     if (absl::StrContains(name, "::MemoryReader") ||
300         absl::StrContains(name, "::MemoryWriter") ||
301         absl::StrContains(name, "::Interleave") ||
302         absl::StrContains(name, "::Prefetch") ||
303         absl::StrContains(name, "::ParallelMap")) {
304       // Ops that read files in advance, including caching, interleaving, and
305       // prefetching.
306       return InputOpCategory::kAdvancedFileRead;
307     } else {
308       // Ops that read files on demand.
309       return InputOpCategory::kDemandedFileRead;
310     }
311   } else {
312     // All other ops are classified as preprocessing.
313     return InputOpCategory::kPreprocessing;
314   }
315 }
316 
// Aggregate of the input-related ops selected from an OpMetricsDb.
struct InputOpMetrics {
  // Metrics of each op classified as an input op (pointers not owned).
  std::vector<const OpMetrics*> input_op_metrics;
  // Sum of the self times (in picoseconds) of all ops in input_op_metrics.
  uint64 input_op_time_ps = 0;
};
321 
SelectInputOpMetrics(const OpMetricsDb & all_op_metrics)322 InputOpMetrics SelectInputOpMetrics(const OpMetricsDb& all_op_metrics) {
323   InputOpMetrics input_op_metrics;
324   for (const OpMetrics* op_metrics : SortedOpMetricsDb(all_op_metrics)) {
325     if (IsInputOp(op_metrics->category())) {
326       input_op_metrics.input_op_metrics.push_back(op_metrics);
327       input_op_metrics.input_op_time_ps += op_metrics->self_time_ps();
328     }
329   }
330   return input_op_metrics;
331 }
332 
// Converts `op_metrics` into an InputOpDetails record: the op's name, count,
// and total/self times both in milliseconds and as percentages of
// `input_op_time_ps` (the combined self time of all input ops).
InputOpDetails ConvertOpMetricsToInputOpDetails(const OpMetrics& op_metrics,
                                                uint64 input_op_time_ps,
                                                InputOpCategory category) {
  InputOpDetails details;
  details.set_op_name(op_metrics.name());
  details.set_count(op_metrics.occurrences());
  details.set_time_in_ms(PicosToMillis(op_metrics.time_ps()));
  details.set_self_time_in_ms(PicosToMillis(op_metrics.self_time_ps()));
  // SafeDivide guards against input_op_time_ps == 0.
  details.set_time_in_percent(
      100.0 * SafeDivide(op_metrics.time_ps(), input_op_time_ps));
  details.set_self_time_in_percent(
      100.0 * SafeDivide(op_metrics.self_time_ps(), input_op_time_ps));
  details.set_category(InputOpCategoryString(category));
  return details;
}
348 
// Returns the ratio of the host-to-device time in each step to the step-time.
// The result is clamped to [0, 1]. Returns 0.0 when neither infeed timestamp
// data nor a positive average step time with an unpackable breakdown is
// available.
double RatioOfHostToDeviceTimeToStepTime(
    const OpMetricsDb& host_tf_metrics_db,
    const InputPipelineAnalysisResult& input_pipeline_analysis) {
  if (host_tf_metrics_db.total_host_infeed_enq_start_timestamp_ps_diff() > 0) {
    // For TPU execution that uses infeed.
    //    We use total_host_infeed_enq_start_timestamp_ps_diff_ to approximate
    //    the total host step time.
    return std::min(
        1.0, SafeDivide(host_tf_metrics_db.total_host_infeed_enq_duration_ps(),
                        host_tf_metrics_db
                            .total_host_infeed_enq_start_timestamp_ps_diff()));
  }
  // For GPU and TPU execution that doesn't use infeed.
  double avg_step_time_ms =
      input_pipeline_analysis.step_time_summary().average();
  if (avg_step_time_ms > 0) {
    // Uses the on-device step time.
    GenericStepTimeBreakdown generic_breakdown;
    if (input_pipeline_analysis.step_time_breakdown().UnpackTo(
            &generic_breakdown)) {
      double avg_host_to_device_time_ms =
          generic_breakdown.host_to_device_ms_summary().average();
      return std::min(1.0,
                      SafeDivide(avg_host_to_device_time_ms, avg_step_time_ms));
    }
  }
  return 0.0;
}
378 
DeviceCollectivesAnalysis(double device_collectives_percent,std::string * device_collectives_classification,std::string * device_collectives_statement)379 void DeviceCollectivesAnalysis(double device_collectives_percent,
380                                std::string* device_collectives_classification,
381                                std::string* device_collectives_statement) {
382   if (device_collectives_percent >=
383       kHighlyDeviceCollectivesBoundThresholdInPercent) {
384     *device_collectives_classification = "high";
385     *device_collectives_statement =
386         absl::StrCat(OneDigit(device_collectives_percent),
387                      " % of the total step time sampled is spent on 'Device "
388                      "Collective Communication'.");
389   } else if (device_collectives_percent >=
390              kModeratelyDeviceCollectivesBoundThresholdInPercent) {
391     *device_collectives_classification = "moderate";
392     *device_collectives_statement =
393         absl::StrCat(OneDigit(device_collectives_percent),
394                      " % of the total step time sampled is spent on 'Device "
395                      "Collective Communication'.");
396   } else {
397     *device_collectives_classification = "no";
398     *device_collectives_statement = "";
399   }
400 }
401 
KernelLaunchAnalysis(bool tfdata_used,double kernel_launch_percent,std::string * kernel_launch_classification,std::string * kernel_launch_statement)402 void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
403                           std::string* kernel_launch_classification,
404                           std::string* kernel_launch_statement) {
405   if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) {
406     *kernel_launch_classification = "high";
407     *kernel_launch_statement = absl::StrCat(
408         OneDigit(kernel_launch_percent),
409         " % of the total step time sampled is spent on 'Kernel Launch'.");
410     if (tfdata_used) {
411       absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
412     }
413   } else if (kernel_launch_percent >=
414              kModeratelyKernelLaunchBoundThresholdInPercent) {
415     *kernel_launch_classification = "moderate";
416     *kernel_launch_statement = absl::StrCat(
417         OneDigit(kernel_launch_percent),
418         " % of the total step time sampled is spent on 'Kernel Launch'.");
419     if (tfdata_used) {
420       absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
421     }
422   } else {
423     *kernel_launch_classification = "no";
424     *kernel_launch_statement = "";
425   }
426 }
427 
AllOtherAnalysis(bool all_other_reported,double all_other_percent,std::string * all_other_classification,std::string * all_other_statement)428 void AllOtherAnalysis(bool all_other_reported, double all_other_percent,
429                       std::string* all_other_classification,
430                       std::string* all_other_statement) {
431   if (all_other_reported) {
432     *all_other_classification = "no";
433     *all_other_statement = "";
434     return;
435   }
436   if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) {
437     *all_other_classification = "high";
438     *all_other_statement =
439         absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
440   } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
441     *all_other_classification = "moderate";
442     *all_other_statement =
443         absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
444   } else {
445     *all_other_classification = "no";
446     *all_other_statement = "";
447   }
448 }
449 
450 // Tests if tf.data API is in use.
TfDataInUse(const InputTimeBreakdown & breakdown)451 bool TfDataInUse(const InputTimeBreakdown& breakdown) {
452   // Do not include enqueue_us because the "enqueue" Op that Xprof recognizes is
453   // not part of tf.data.
454   return breakdown.demanded_file_read_us() > 0 ||
455          breakdown.advanced_file_read_us() > 0 ||
456          breakdown.preprocessing_us() > 0;
457 }
458 
459 // Returns a HTML link with the given text.
MakeDocLink(absl::string_view doc_link,absl::string_view text)460 std::string MakeDocLink(absl::string_view doc_link, absl::string_view text) {
461   return absl::StrCat("<a href=\"", doc_link, "\" target=\"_blank\">", text,
462                       "</a>");
463 }
464 
// Returns the URL of the introduction to the tf.data API.
std::string DatasetIntroDoc() {
  static const char kDatasetGuideUrl[] = "https://www.tensorflow.org/guide/data";
  return kDatasetGuideUrl;
}
469 
470 }  // namespace
471 
// Fills in the host-side (input-op) portion of `result`: the per-op input
// details and the breakdown of input time (in microseconds) into enqueue,
// demanded file read, advanced file read, preprocessing, and unclassified
// components. Does nothing if no input ops are found.
void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
                        InputPipelineAnalysisResult* result) {
  InputOpMetrics input_op_metrics = SelectInputOpMetrics(host_tf_metrics_db);
  // Returns if the program is not using an input pipeline with
  // instrumentation and hence no input ops are found.
  if (input_op_metrics.input_op_metrics.empty()) return;

  // Records the details of each input op and aggregates the ops' self times
  // (in microseconds) by input-op category.
  absl::flat_hash_map<InputOpCategory, double> aggregated_input_op_times_us;
  for (const OpMetrics* op_metrics : input_op_metrics.input_op_metrics) {
    InputOpCategory category =
        CategorizeInputOp(op_metrics->name(), op_metrics->category());
    *result->add_input_op_details() = ConvertOpMetricsToInputOpDetails(
        *op_metrics, input_op_metrics.input_op_time_ps, category);
    aggregated_input_op_times_us[category] +=
        PicosToMicros(op_metrics->self_time_ps());
  }

  double enqueue_time_us =
      aggregated_input_op_times_us[InputOpCategory::kEnqueue];
  // Total of the non-enqueue input-op categories, used below as the scaling
  // denominator.
  double total_input_op_time_us =
      aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kPreprocessing];

  // `ratio` approximates host-to-device time / step time, clamped to [0, 1].
  double ratio = RatioOfHostToDeviceTimeToStepTime(host_tf_metrics_db, *result);
  DCHECK_LE(ratio, 1.0);
  DCHECK_GE(ratio, 0.0);
  // Estimates the non-enqueue time from the enqueue time: treating
  // enqueue_time_us as the `ratio` fraction of the step, the remaining
  // (1 - ratio) fraction is enqueue_time_us * (1 - ratio) / ratio. When the
  // ratio is unavailable (0.0), falls back to the measured non-enqueue op
  // time.
  double non_enqueue_time_us = (ratio != 0.0)
                                   ? (enqueue_time_us * (1.0 - ratio) / ratio)
                                   : total_input_op_time_us;

  // Scales the various input-time components wrt to non_enqueue_time_us,
  // preserving the proportions measured among the input-op categories.
  double scaled_demanded_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead],
      total_input_op_time_us);
  double scaled_advanced_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead],
      total_input_op_time_us);
  double scaled_preprocessing_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kPreprocessing],
      total_input_op_time_us);
  // Whatever remains after scaling is reported as unclassified non-enqueue
  // time, clamped at zero to absorb floating-point error.
  double unclassified_non_enqueue_time_us = std::max(
      0.0, non_enqueue_time_us - scaled_demanded_fileread_time_us -
               scaled_advanced_fileread_time_us - scaled_preprocessing_time_us);

  InputTimeBreakdown* input_time_breakdown =
      result->mutable_input_time_breakdown();
  input_time_breakdown->set_enqueue_us(enqueue_time_us);
  input_time_breakdown->set_demanded_file_read_us(
      scaled_demanded_fileread_time_us);
  input_time_breakdown->set_advanced_file_read_us(
      scaled_advanced_fileread_time_us);
  input_time_breakdown->set_preprocessing_us(scaled_preprocessing_time_us);
  input_time_breakdown->set_unclassified_non_enqueue_us(
      unclassified_non_enqueue_time_us);
}
531 
// Builds the static list of input-pipeline tuning recommendations, one per
// input-op category, each with HTML links into the tf.data documentation.
InputPipelineAnalysisRecommendation GenerateRecommendation() {
  // Documentation URLs referenced by the recommendations below.
  const absl::string_view kDatasetIntro =
      "https://www.tensorflow.org/programmers_guide/datasets";

  const absl::string_view kDatasetTopic =
      "https://www.tensorflow.org/api_docs/python/tf/data/Dataset#";

  const absl::string_view kTfRecordDataset =
      "https://www.tensorflow.org/api_docs/python/tf/data/"
      "TFRecordDataset#class_tfrecorddataset";

  InputPipelineAnalysisRecommendation recommendation;
  // Advice for enqueueing.
  *recommendation.add_details() =
      "Enqueuing data: you may want to combine small input data chunks "
      "into fewer "
      "but larger chunks.";
  // Advice for data preprocessing.
  *recommendation.add_details() = absl::StrCat(
      "Data preprocessing: you may increase num_parallel_calls in ",
      AnchorElement(absl::StrCat(kDatasetTopic, "map"), "Dataset map()"),
      " or preprocess the data OFFLINE.");
  // Advice for advanced (in-advance) file reads.
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files in advance: you may tune parameters in the "
      "following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch size"),
      ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"),
                    "interleave cycle_length"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer_size"), ")");
  // Advice for demanded (on-demand) file reads.
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files on demand: you should read data IN ADVANCE "
      "using the following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch"), ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"), "interleave"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer"), ")");
  // Catch-all advice for any other data reading or processing.
  *recommendation.add_details() = absl::StrCat(
      "Other data reading or processing: you may consider using the ",
      AnchorElement(kDatasetIntro, "tf.data API"),
      " (if you are not using it now)");

  return recommendation;
}
573 
ComputeStepTimeSummaryInMs(const protobuf::RepeatedPtrField<PerCoreStepInfo> & grouped_by_step)574 StepSummary ComputeStepTimeSummaryInMs(
575     const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
576   Stat<double> total_step_stats_in_ms;
577   // iterates over each step.
578   for (const auto& coreid_stepinfo_map : grouped_by_step) {
579     double max_per_step_stats_in_ms = 0.0;
580     // iterates over each core.
581     for (const auto& coreid_and_stepinfo :
582          coreid_stepinfo_map.step_info_per_core()) {
583       const auto& step_info = coreid_and_stepinfo.second;
584       max_per_step_stats_in_ms = std::max(step_info.duration_ps() / kNumPsPerMs,
585                                           max_per_step_stats_in_ms);
586     }
587     // Step time of each step is determined by the slowest core.
588     total_step_stats_in_ms.UpdateStat(max_per_step_stats_in_ms);
589   }
590 
591   return GetStepSummaryForSampleStats(total_step_stats_in_ms);
592 }
593 
// Top-level conversion: builds the complete input-pipeline analysis from
// `op_stats`, combining the device-side step breakdown, host-side input-op
// breakdown, bottleneck analysis, and tuning recommendations.
InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
    const OpStats& op_stats) {
  // Device-side per-step analysis from the step database.
  InputPipelineAnalysisResult result =
      ComputeGenericInputPipelineAnalysisResult(
          op_stats.step_db().step_sequence());
  PopulateStepDiagnostics(op_stats, result.mutable_diagnostics());
  result.set_hardware_type(HardwareType_Name(
      ParseHardwareType(op_stats.run_environment().device_type())));
  // Host-side input-op analysis.
  GenerateHostResult(op_stats.host_op_metrics_db(), &result);

  InputPipelineAnalysisRecommendation recommendation = GenerateRecommendation();
  BottleneckAnalysis bottleneck_analysis = ComputeBottleneckAnalysis(
      result.input_time_breakdown(), result.step_details());
  // Copies the bottleneck percentages onto the top-level result.
  result.set_input_percent(bottleneck_analysis.input_percent());
  result.set_output_percent(bottleneck_analysis.output_percent());
  result.set_idle_percent(bottleneck_analysis.idle_percent());
  result.set_compute_percent(bottleneck_analysis.compute_percent());

  recommendation.mutable_bottleneck_analysis()->PackFrom(bottleneck_analysis);
  *recommendation.mutable_summary_next_step() =
      GetSummaryNextStep(bottleneck_analysis.input_classification(),
                         result.input_time_breakdown());

  *result.mutable_recommendation() = recommendation;
  return result;
}
620 
// Classifies how input-bound the program is ("host", "both", or "device")
// and writes a human-readable statement. Returns true only in the
// "potentially input-bound" case, i.e. when the measured input percentage is
// below the moderate threshold but the 'All Others' time is significant.
bool InputAnalysis(double input_percent, double all_other_percent,
                   std::string* input_classification,
                   std::string* input_statement) {
  absl::string_view non_input_time = "other time";
  if (input_percent >= kHighlyInfeedBoundThresholdInPercent) {
    *input_classification = "host";
    *input_statement = absl::StrCat(
        "Your program is HIGHLY input-bound because ", OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, you "
        "should first focus on reducing the input time.");
    return false;
  } else if (input_percent >= kModeratelyInfeedBoundThresholdInPercent) {
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is MODERATELY input-bound because ",
        OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, "
        "you would need to reduce both the input time and ",
        non_input_time, ".");
    return false;
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    // Input analysis says it is not input-bound, but "All-Other" time
    // is significant. It could still be input-bound (or Python overhead).
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is POTENTIALLY input-bound because ",
        OneDigit(all_other_percent),
        "% of the total step time sampled is spent on 'All Others' time (which "
        "could be due to I/O or Python execution or both).");
    return true;
  } else {
    // Definitely not input-bound.
    *input_classification = "device";
    *input_statement =
        absl::StrCat("Your program is NOT input-bound because only ",
                     OneDigit(input_percent),
                     "% of the total step time sampled is waiting for "
                     "input. Therefore, you should focus on "
                     "reducing ",
                     non_input_time, ".");
    return false;
  }
}
664 
OutputAnalysis(double output_percent,std::string * output_classification,std::string * output_statement)665 void OutputAnalysis(double output_percent, std::string* output_classification,
666                     std::string* output_statement) {
667   if (output_percent >= kHighlyOutfeedBoundThresholdInPercent) {
668     *output_classification = "host";
669     *output_statement = absl::StrCat(
670         "Your program is HIGHLY output-bound because ",
671         OneDigit(output_percent),
672         "% of the total step time sampled is spent on output. Therefore, you "
673         "should first focus on reducing the output time.");
674   } else if (output_percent >= kModeratelyOutfeedBoundThresholdInPercent) {
675     *output_classification = "both";
676     *output_statement = absl::StrCat(
677         "Your program is MODERATELY output-bound because ",
678         OneDigit(output_percent),
679         "% of the total step time sampled is spent on output. Therefore, "
680         "you would need to reduce both the output time and other time.");
681   } else {
682     *output_classification = "device";
683     *output_statement = "";
684   }
685 }
686 
// Aggregates the per-step breakdowns in `any_step_details` (each Any is
// expected to unpack to a PerGenericStepDetails) into whole-run time totals,
// converts them to percentages of total step time, and runs the individual
// input/output/kernel-launch/all-other/collectives analyses to build a
// BottleneckAnalysis with classifications and human-readable statements.
BottleneckAnalysis ComputeBottleneckAnalysis(
    const InputTimeBreakdown& input_time_breakdown,
    const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
        any_step_details) {
  // Running totals (in ms) across all steps.
  double total_step_time_ms = 0;
  double total_input_ms = 0;
  double total_output_ms = 0;
  double total_host_compute_ms = 0;
  double total_host_prepare_ms = 0;
  double total_host_compile_ms = 0;
  double total_device_compute_ms = 0;
  double total_device_to_device_ms = 0;
  double total_device_collectives_ms = 0;
  double total_unknown_ms = 0;

  for (const google::protobuf::Any& step_details : any_step_details) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    // An empty Any just contributes zeros below; a non-empty Any of the
    // wrong type means the step DB is not generic, so give up entirely.
    if (!success && !step_details.type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    total_step_time_ms += details.step_time_ms();
    // Input time = host-side wait for input + host-to-device transfer.
    total_input_ms +=
        details.host_wait_input_ms() + details.host_to_device_ms();
    total_output_ms += details.output_ms();
    total_host_prepare_ms += details.host_prepare_ms();
    total_device_compute_ms += details.device_compute_ms();
    total_device_to_device_ms += details.device_to_device_ms();
    total_device_collectives_ms += details.device_collectives_ms();
    total_host_compute_ms += details.host_compute_ms();
    total_host_compile_ms += details.host_compile_ms();
    total_unknown_ms += details.unknown_time_ms();
  }

  // Without any measured step time we cannot attribute anything; return a
  // BottleneckAnalysis that says so (and disables the other analyses).
  if (total_step_time_ms == 0) {
    BottleneckAnalysis analysis;
    analysis.set_input_classification("unknown");
    analysis.set_input_statement(
        "No step time measured. Therefore we cannot tell where the "
        "performance bottleneck is.");
    analysis.set_kernel_launch_classification("no");
    analysis.set_kernel_launch_statement("");
    analysis.set_all_other_classification("no");
    analysis.set_all_other_statement("");
    analysis.set_device_collectives_classification("no");
    analysis.set_device_collectives_statement("");
    return analysis;
  }
  // Convert the totals to percentages of the total step time.
  double input_percent = 100.0 * total_input_ms / total_step_time_ms;
  double output_percent = 100.0 * total_output_ms / total_step_time_ms;
  double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
  double device_collectives_percent =
      100.0 * total_device_collectives_ms / total_step_time_ms;

  // idle_percent is everything not attributed above: host_prepare (i.e.
  // kernel launch), device-to-device, host compute, host compile, and
  // unknown time. Clamped at 0 to guard against rounding.
  double idle_percent =
      std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
                        device_collectives_percent);
  double kernel_launch_percent =
      100.0 * total_host_prepare_ms / total_step_time_ms;
  double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;

  std::string input_classification;
  std::string input_statement;
  // InputAnalysis returns true when it already reported the "All Others"
  // time, so AllOtherAnalysis below can avoid repeating it.
  bool all_other_reported =
      InputAnalysis(input_percent, all_other_percent, &input_classification,
                    &input_statement);

  std::string device_collectives_classification;
  std::string device_collectives_statement;
  DeviceCollectivesAnalysis(device_collectives_percent,
                            &device_collectives_classification,
                            &device_collectives_statement);

  std::string kernel_launch_classification;
  std::string kernel_launch_statement;
  // Kernel-launch advice differs depending on whether tf.data is in use.
  KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
                       &kernel_launch_classification, &kernel_launch_statement);

  std::string all_other_classification;
  std::string all_other_statement;
  AllOtherAnalysis(all_other_reported, all_other_percent,
                   &all_other_classification, &all_other_statement);

  // Assemble the result proto.
  BottleneckAnalysis analysis;
  analysis.set_input_percent(input_percent);
  analysis.set_output_percent(output_percent);
  analysis.set_idle_percent(idle_percent);
  analysis.set_compute_percent(compute_percent);

  analysis.set_input_classification(input_classification);
  analysis.set_input_statement(input_statement);
  analysis.set_kernel_launch_classification(kernel_launch_classification);
  analysis.set_kernel_launch_statement(kernel_launch_statement);
  analysis.set_all_other_classification(all_other_classification);
  analysis.set_all_other_statement(all_other_statement);
  analysis.set_device_collectives_classification(
      device_collectives_classification);
  analysis.set_device_collectives_statement(device_collectives_statement);

  return analysis;
}
792 
GetSummaryNextStep(absl::string_view input_classification,const InputTimeBreakdown & breakdown)793 std::string GetSummaryNextStep(absl::string_view input_classification,
794                                const InputTimeBreakdown& breakdown) {
795   std::string summary_next_step;
796   if (input_classification == "host" || input_classification == "both") {
797     if (!TfDataInUse(breakdown)) {
798       summary_next_step = absl::StrCat(
799           "Consider using ", MakeDocLink(DatasetIntroDoc(), "the tf.data API"),
800           " to enable profiler's host-side analysis for input pipeline. "
801           "Profiler currently does not support custom input pipeline (please "
802           "ignore "
803           "Section ",
804           kHostAnalysisSectionNumber, " below).");
805     } else {
806       summary_next_step =
807           absl::StrCat("Look at Section ", kHostAnalysisSectionNumber,
808                        " for the breakdown of input time on the host.");
809     }
810   } else {
811     summary_next_step = "You may skip the rest of this page.";
812   }
813 
814   return summary_next_step;
815 }
816 
HostToDeviceTransferAsPercentOfInputTime(const InputTimeBreakdown & breakdown)817 double HostToDeviceTransferAsPercentOfInputTime(
818     const InputTimeBreakdown& breakdown) {
819   // Thanks to the scaling trick we did in GenerateHostResult(), we can
820   // estimate the percentage of input-time spent on host-to-device transfer in
821   // the following way.
822   double total_input_time_us =
823       breakdown.demanded_file_read_us() + breakdown.advanced_file_read_us() +
824       breakdown.preprocessing_us() + breakdown.enqueue_us() +
825       breakdown.unclassified_non_enqueue_us();
826   return 100.0 * SafeDivide(breakdown.enqueue_us(), total_input_time_us);
827 }
828 
829 }  // namespace profiler
830 }  // namespace tensorflow
831