/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"

#include <math.h>

#include <algorithm>
#include <vector>

#include "google/protobuf/any.pb.h"
#include "absl/container/flat_hash_map.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
#include "tensorflow/core/profiler/utils/diagnostics.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/format_utils.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/profiler/utils/html_utils.h"
#include "tensorflow/core/profiler/utils/math_utils.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/time_utils.h"
#include "tensorflow/core/util/stats_calculator.h"

namespace tensorflow {
namespace profiler {

namespace {

const double kNumPsPerMs = 1000000000.0;
// If the percentage of step time that is due to infeed is less than
// kModeratelyInfeedBoundThresholdInPercent, it is considered NOT
// input-bound; else if it is less than
// kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY
// input-bound; else it is considered HIGHLY input-bound.
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to outfeed is less than
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
// output-bound; else if it is less than
// kHighlyOutfeedBoundThresholdInPercent, it is considered MODERATELY
// output-bound; else it is considered HIGHLY output-bound.
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;

// If the percentage of step time that is due to kernel launch is less than
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
// kernel-launch bound; else if it is less than
// kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY
// kernel-launch bound; else it is considered HIGHLY kernel-launch bound.
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;

// If the percentage of step time that is due to all other time is less than
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
// all-other bound; else if it is less than
// kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY
// all-other bound; else it is considered HIGHLY all-other bound.
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;

// If the percentage of step time that is due to device collectives is less
// than kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered
// NOT device-collectives bound; else if it is less than
// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
// device-collectives bound; else it is considered HIGHLY device-collectives
// bound.
constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;
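//
// As a worked example of how these thresholds feed the classification
// routines below (InputAnalysis, KernelLaunchAnalysis, etc.): a step that
// spends 12% of its time waiting for infeed falls between the 5% and 20%
// thresholds above and is reported as MODERATELY input-bound, 25% would be
// HIGHLY input-bound, and 3% would normally be NOT input-bound (unless the
// "All Others" time is itself significant; see InputAnalysis()).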

// Section number of the host-analysis section in the input-pipeline analysis.
constexpr int kHostAnalysisSectionNumber = 3;
// Python-only explanation for "All Others" time.
const char* kAllOthersPythonExplanation =
    " % of the total step time sampled is spent on 'All Others' time. "
    "This could be due to Python execution overhead.";
// Explanation for "Kernel Launch" time due to CPU contention with tf.data.
const char* kKernelLaunchTfDataContention =
    " It could be due to CPU contention with tf.data. In this case, you may "
    "try to set the environment variable TF_GPU_THREAD_MODE=gpu_private.";

template <class Collection>
double GetTimeInMs(const Collection& type_ps, EventType event_type) {
  return PicosToMillis(gtl::FindWithDefault(type_ps, event_type, /*value=*/0));
}
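// For illustration, a hypothetical call (values in picoseconds, converted to
// milliseconds by PicosToMillis):
//   absl::flat_hash_map<EventType, uint64> ps = {{HOST_COMPUTE, 2500000000}};
//   GetTimeInMs(ps, HOST_COMPUTE);  // 2.5 (ms)
//   GetTimeInMs(ps, HOST_PREPARE);  // 0.0 (missing key -> default value 0)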

StepSummary GetStepSummaryForSampleStats(const Stat<double>& sample_stats) {
  StepSummary step_time_summary;
  double avg, sdv, min, max;
  if (sample_stats.empty()) {
    // If sample_stats is empty, sample_stats.avg() will return NaN. However,
    // we prefer to show a 0 instead.
    avg = sdv = min = max = 0.0;
  } else {
    avg = sample_stats.avg();
    sdv = sqrt(sample_stats.sample_variance());
    min = sample_stats.min();
    max = sample_stats.max();
  }
  step_time_summary.set_average(avg);
  step_time_summary.set_standard_deviation(sdv);
  step_time_summary.set_minimum(min);
  step_time_summary.set_maximum(max);
  return step_time_summary;
}

GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
    const InputPipelineAnalysisResult& analysis) {
  Stat<double> unknown_time_ms;
  Stat<double> host_wait_input_ms;
  Stat<double> host_to_device_ms;
  Stat<double> input_ms;
  Stat<double> output_ms;
  Stat<double> device_compute_ms;
  Stat<double> device_to_device_ms;
  Stat<double> device_collectives_ms;
  Stat<double> host_compute_ms;
  Stat<double> host_prepare_ms;
  Stat<double> host_compile_ms;
  GenericStepTimeBreakdown result;

  for (const google::protobuf::Any& step_details : analysis.step_details()) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    if (!success && !step_details.type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    unknown_time_ms.UpdateStat(details.unknown_time_ms());
    host_wait_input_ms.UpdateStat(details.host_wait_input_ms());
    host_to_device_ms.UpdateStat(details.host_to_device_ms());
    input_ms.UpdateStat(details.host_wait_input_ms() +
                        details.host_to_device_ms());
    output_ms.UpdateStat(details.output_ms());
    device_compute_ms.UpdateStat(details.device_compute_ms());
    device_to_device_ms.UpdateStat(details.device_to_device_ms());
    device_collectives_ms.UpdateStat(details.device_collectives_ms());
    host_compute_ms.UpdateStat(details.host_compute_ms());
    host_prepare_ms.UpdateStat(details.host_prepare_ms());
    host_compile_ms.UpdateStat(details.host_compile_ms());
  }
  *result.mutable_unknown_time_ms_summary() =
      GetStepSummaryForSampleStats(unknown_time_ms);
  *result.mutable_host_wait_input_ms_summary() =
      GetStepSummaryForSampleStats(host_wait_input_ms);
  *result.mutable_host_to_device_ms_summary() =
      GetStepSummaryForSampleStats(host_to_device_ms);
  *result.mutable_input_ms_summary() = GetStepSummaryForSampleStats(input_ms);
  *result.mutable_output_ms_summary() = GetStepSummaryForSampleStats(output_ms);
  *result.mutable_device_compute_ms_summary() =
      GetStepSummaryForSampleStats(device_compute_ms);
  *result.mutable_device_to_device_ms_summary() =
      GetStepSummaryForSampleStats(device_to_device_ms);
  *result.mutable_device_collectives_ms_summary() =
      GetStepSummaryForSampleStats(device_collectives_ms);
  *result.mutable_host_compute_ms_summary() =
      GetStepSummaryForSampleStats(host_compute_ms);
  *result.mutable_host_prepare_ms_summary() =
      GetStepSummaryForSampleStats(host_prepare_ms);
  *result.mutable_host_compile_ms_summary() =
      GetStepSummaryForSampleStats(host_compile_ms);
  return result;
}

InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
    const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
  InputPipelineAnalysisResult result;

  // Computes the summary of step time in ms.
  *result.mutable_step_time_summary() =
      ComputeStepTimeSummaryInMs(grouped_by_step);

  Stat<double> input_summary_stats_in_percent;
  for (const auto& coreid_stepinfo_map : grouped_by_step) {
    // Iterates over each step.
    const auto* ptr = gtl::FindOrNull(coreid_stepinfo_map.step_info_per_core(),
                                      kDefaultGpuLocalCoreId);
    if (ptr == nullptr) {
      // For generic hardware, all step-info is put under core-0. If ptr
      // is nullptr, it means there is no step at all.
      continue;
    }
    const StepInfoResult& step_info = *ptr;
    // Adds the details for a new step.
    PerGenericStepDetails details;
    details.set_step_number(step_info.step_num());
    details.set_step_time_ms(PicosToMillis(step_info.duration_ps()));
    GenericStepBreakdown generic;
    bool success = step_info.step_breakdown().UnpackTo(&generic);
    if (!success && !step_info.step_breakdown().type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    const auto& type_ps = generic.type_ps();
    details.set_unknown_time_ms(GetTimeInMs(type_ps, UNKNOWN_TIME));
    details.set_host_wait_input_ms(GetTimeInMs(type_ps, HOST_WAIT_INPUT));
    details.set_host_to_device_ms(GetTimeInMs(type_ps, HOST_TO_DEVICE) +
                                  GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
    details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
    details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
    details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                    GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
    details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
    details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
    details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
    result.add_step_details()->PackFrom(details);

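    // The input time of a step is the host time spent waiting for input plus
    // the time transferring input from host to device; e.g. a 100 ms step
    // with 8 ms of host_wait_input and 4 ms of host_to_device is 12% input.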
    const double input_percent_of_step_time =
        100.0 *
        SafeDivide(details.host_wait_input_ms() + details.host_to_device_ms(),
                   details.step_time_ms());
    input_summary_stats_in_percent.UpdateStat(input_percent_of_step_time);
  }

  // Computes the summary of input time as percentage of step time.
  *result.mutable_input_percent_summary() =
      GetStepSummaryForSampleStats(input_summary_stats_in_percent);

  // Computes the breakdown of step time.
  GenericStepTimeBreakdown generic_step_time_breakdown =
      ComputeGenericStepTimeBreakdownInMs(result);
  result.mutable_step_time_breakdown()->PackFrom(generic_step_time_breakdown);

  return result;
}

// Classification of input processing on the host.
enum class InputOpCategory {
  kEnqueue,           // enqueue data to be transferred to device.
  kDemandedFileRead,  // demanded read from file.
  kAdvancedFileRead,  // advanced read from file (including cached,
                      // prefetch, parallel-map, interleave).
  kPreprocessing      // data preprocessing.
};

std::string InputOpCategoryString(InputOpCategory category) {
  switch (category) {
    case InputOpCategory::kEnqueue:
      return "Enqueue";
    case InputOpCategory::kDemandedFileRead:
      return "Demanded file read";
    case InputOpCategory::kAdvancedFileRead:
      return "Advanced file read";
    case InputOpCategory::kPreprocessing:
      return "Preprocessing";
  }
}

inline bool IsInputOp(absl::string_view category) {
  // Do not include "IteratorGetNext*" here, because IteratorGetNext is an Op
  // that experiences the input stall, not an Op that causes the input stall.
  return IsInfeedEnqueueOp(category) || IsDatasetOp(category) ||
         IsMemcpyHToDOp(category);
}

// TODO(ckluk):
// Confirm with the tf.data team if the classification below is correct.
InputOpCategory CategorizeInputOp(absl::string_view name,
                                  absl::string_view category) {
  if (IsInfeedEnqueueOp(category) || IsMemcpyHToDOp(category)) {
    // Ops for sending input from host to device.
    return InputOpCategory::kEnqueue;
  }
  DCHECK(IsDatasetOp(category));
  if (absl::EndsWith(name, "::TFRecord") ||
      absl::EndsWith(name, "::TextLine") ||
      absl::EndsWith(name, "::FixedLengthRecord") ||
      absl::EndsWith(name, "::SSTable") || absl::EndsWith(name, "::RecordIO")) {
    // Ops that read files.
    if (absl::StrContains(name, "::MemoryReader") ||
        absl::StrContains(name, "::MemoryWriter") ||
        absl::StrContains(name, "::Interleave") ||
        absl::StrContains(name, "::Prefetch") ||
        absl::StrContains(name, "::ParallelMap")) {
      // Ops that read files in advance, including caching, interleaving, and
      // prefetching.
      return InputOpCategory::kAdvancedFileRead;
    } else {
      // Ops that read files on demand.
      return InputOpCategory::kDemandedFileRead;
    }
  } else {
    // All other ops are classified as preprocessing.
    return InputOpCategory::kPreprocessing;
  }
}

struct InputOpMetrics {
  std::vector<const OpMetrics*> input_op_metrics;
  uint64 input_op_time_ps = 0;
};

InputOpMetrics SelectInputOpMetrics(const OpMetricsDb& all_op_metrics) {
  InputOpMetrics input_op_metrics;
  for (const OpMetrics* op_metrics : SortedOpMetricsDb(all_op_metrics)) {
    if (IsInputOp(op_metrics->category())) {
      input_op_metrics.input_op_metrics.push_back(op_metrics);
      input_op_metrics.input_op_time_ps += op_metrics->self_time_ps();
    }
  }
  return input_op_metrics;
}

InputOpDetails ConvertOpMetricsToInputOpDetails(const OpMetrics& op_metrics,
                                                uint64 input_op_time_ps,
                                                InputOpCategory category) {
  InputOpDetails details;
  details.set_op_name(op_metrics.name());
  details.set_count(op_metrics.occurrences());
  details.set_time_in_ms(PicosToMillis(op_metrics.time_ps()));
  details.set_self_time_in_ms(PicosToMillis(op_metrics.self_time_ps()));
  details.set_time_in_percent(
      100.0 * SafeDivide(op_metrics.time_ps(), input_op_time_ps));
  details.set_self_time_in_percent(
      100.0 * SafeDivide(op_metrics.self_time_ps(), input_op_time_ps));
  details.set_category(InputOpCategoryString(category));
  return details;
}

// Returns the ratio of the host-to-device time in each step to the step time.
double RatioOfHostToDeviceTimeToStepTime(
    const OpMetricsDb& host_tf_metrics_db,
    const InputPipelineAnalysisResult& input_pipeline_analysis) {
  if (host_tf_metrics_db.total_host_infeed_enq_start_timestamp_ps_diff() > 0) {
    // For TPU execution that uses infeed.
    // We use total_host_infeed_enq_start_timestamp_ps_diff to approximate
    // the total host step time.
    return std::min(
        1.0, SafeDivide(host_tf_metrics_db.total_host_infeed_enq_duration_ps(),
                        host_tf_metrics_db
                            .total_host_infeed_enq_start_timestamp_ps_diff()));
  }
  // For GPU and TPU execution that doesn't use infeed.
  double avg_step_time_ms =
      input_pipeline_analysis.step_time_summary().average();
  if (avg_step_time_ms > 0) {
    // Uses the on-device step time.
    GenericStepTimeBreakdown generic_breakdown;
    if (input_pipeline_analysis.step_time_breakdown().UnpackTo(
            &generic_breakdown)) {
      double avg_host_to_device_time_ms =
          generic_breakdown.host_to_device_ms_summary().average();
      return std::min(1.0,
                      SafeDivide(avg_host_to_device_time_ms, avg_step_time_ms));
    }
  }
  return 0.0;
}

void DeviceCollectivesAnalysis(double device_collectives_percent,
                               std::string* device_collectives_classification,
                               std::string* device_collectives_statement) {
  if (device_collectives_percent >=
      kHighlyDeviceCollectivesBoundThresholdInPercent) {
    *device_collectives_classification = "high";
    *device_collectives_statement =
        absl::StrCat(OneDigit(device_collectives_percent),
                     " % of the total step time sampled is spent on 'Device "
                     "Collective Communication'.");
  } else if (device_collectives_percent >=
             kModeratelyDeviceCollectivesBoundThresholdInPercent) {
    *device_collectives_classification = "moderate";
    *device_collectives_statement =
        absl::StrCat(OneDigit(device_collectives_percent),
                     " % of the total step time sampled is spent on 'Device "
                     "Collective Communication'.");
  } else {
    *device_collectives_classification = "no";
    *device_collectives_statement = "";
  }
}

void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
                          std::string* kernel_launch_classification,
                          std::string* kernel_launch_statement) {
  if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) {
    *kernel_launch_classification = "high";
    *kernel_launch_statement = absl::StrCat(
        OneDigit(kernel_launch_percent),
        " % of the total step time sampled is spent on 'Kernel Launch'.");
    if (tfdata_used) {
      absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
    }
  } else if (kernel_launch_percent >=
             kModeratelyKernelLaunchBoundThresholdInPercent) {
    *kernel_launch_classification = "moderate";
    *kernel_launch_statement = absl::StrCat(
        OneDigit(kernel_launch_percent),
        " % of the total step time sampled is spent on 'Kernel Launch'.");
    if (tfdata_used) {
      absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
    }
  } else {
    *kernel_launch_classification = "no";
    *kernel_launch_statement = "";
  }
}

void AllOtherAnalysis(bool all_other_reported, double all_other_percent,
                      std::string* all_other_classification,
                      std::string* all_other_statement) {
  if (all_other_reported) {
    *all_other_classification = "no";
    *all_other_statement = "";
    return;
  }
  if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) {
    *all_other_classification = "high";
    *all_other_statement =
        absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    *all_other_classification = "moderate";
    *all_other_statement =
        absl::StrCat(OneDigit(all_other_percent), kAllOthersPythonExplanation);
  } else {
    *all_other_classification = "no";
    *all_other_statement = "";
  }
}

// Tests if tf.data API is in use.
bool TfDataInUse(const InputTimeBreakdown& breakdown) {
  // Do not include enqueue_us because the "enqueue" Op that Xprof recognizes
  // is not part of tf.data.
  return breakdown.demanded_file_read_us() > 0 ||
         breakdown.advanced_file_read_us() > 0 ||
         breakdown.preprocessing_us() > 0;
}

// Returns an HTML link with the given text.
std::string MakeDocLink(absl::string_view doc_link, absl::string_view text) {
  return absl::StrCat("<a href=\"", doc_link, "\" target=\"_blank\">", text,
                      "</a>");
}

// Returns the URL of the introduction to the tf.data API.
std::string DatasetIntroDoc() {
  return "https://www.tensorflow.org/guide/data";
}

}  // namespace

void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
                        InputPipelineAnalysisResult* result) {
  InputOpMetrics input_op_metrics = SelectInputOpMetrics(host_tf_metrics_db);
  // Returns early if the program is not using an instrumented input pipeline,
  // in which case no input ops are found.
  if (input_op_metrics.input_op_metrics.empty()) return;

  absl::flat_hash_map<InputOpCategory, double> aggregated_input_op_times_us;
  for (const OpMetrics* op_metrics : input_op_metrics.input_op_metrics) {
    InputOpCategory category =
        CategorizeInputOp(op_metrics->name(), op_metrics->category());
    *result->add_input_op_details() = ConvertOpMetricsToInputOpDetails(
        *op_metrics, input_op_metrics.input_op_time_ps, category);
    aggregated_input_op_times_us[category] +=
        PicosToMicros(op_metrics->self_time_ps());
  }

  double enqueue_time_us =
      aggregated_input_op_times_us[InputOpCategory::kEnqueue];
  double total_input_op_time_us =
      aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead] +
      aggregated_input_op_times_us[InputOpCategory::kPreprocessing];

  double ratio = RatioOfHostToDeviceTimeToStepTime(host_tf_metrics_db, *result);
  DCHECK_LE(ratio, 1.0);
  DCHECK_GE(ratio, 0.0);
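  // Estimates the non-enqueue (file read and preprocessing) time budget.
  // Since ratio approximates enqueue_time / host_step_time, we have
  // host_step_time ~= enqueue_time / ratio, so the remaining time is
  // enqueue_time * (1 - ratio) / ratio. When the ratio is unavailable (0),
  // fall back to the measured self time of the non-enqueue input ops.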
  double non_enqueue_time_us = (ratio != 0.0)
                                   ? (enqueue_time_us * (1.0 - ratio) / ratio)
                                   : total_input_op_time_us;

  // Scales the various input-time components with respect to
  // non_enqueue_time_us.
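  // Each category keeps its share of the measured input-op time, e.g.
  //   scaled_demanded_fileread_time_us = non_enqueue_time_us *
  //       (demanded file-read time) / total_input_op_time_us,
  // so the three scaled components add back up to non_enqueue_time_us
  // whenever total_input_op_time_us > 0.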
  double scaled_demanded_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kDemandedFileRead],
      total_input_op_time_us);
  double scaled_advanced_fileread_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kAdvancedFileRead],
      total_input_op_time_us);
  double scaled_preprocessing_time_us = SafeDivide(
      non_enqueue_time_us *
          aggregated_input_op_times_us[InputOpCategory::kPreprocessing],
      total_input_op_time_us);
  double unclassified_non_enqueue_time_us = std::max(
      0.0, non_enqueue_time_us - scaled_demanded_fileread_time_us -
               scaled_advanced_fileread_time_us - scaled_preprocessing_time_us);

  InputTimeBreakdown* input_time_breakdown =
      result->mutable_input_time_breakdown();
  input_time_breakdown->set_enqueue_us(enqueue_time_us);
  input_time_breakdown->set_demanded_file_read_us(
      scaled_demanded_fileread_time_us);
  input_time_breakdown->set_advanced_file_read_us(
      scaled_advanced_fileread_time_us);
  input_time_breakdown->set_preprocessing_us(scaled_preprocessing_time_us);
  input_time_breakdown->set_unclassified_non_enqueue_us(
      unclassified_non_enqueue_time_us);
}

InputPipelineAnalysisRecommendation GenerateRecommendation() {
  const absl::string_view kDatasetIntro =
      "https://www.tensorflow.org/programmers_guide/datasets";

  const absl::string_view kDatasetTopic =
      "https://www.tensorflow.org/api_docs/python/tf/data/Dataset#";

  const absl::string_view kTfRecordDataset =
      "https://www.tensorflow.org/api_docs/python/tf/data/"
      "TFRecordDataset#class_tfrecorddataset";

  InputPipelineAnalysisRecommendation recommendation;
  *recommendation.add_details() =
      "Enqueuing data: you may want to combine small input data chunks "
      "into fewer but larger chunks.";
  *recommendation.add_details() = absl::StrCat(
      "Data preprocessing: you may increase num_parallel_calls in ",
      AnchorElement(absl::StrCat(kDatasetTopic, "map"), "Dataset map()"),
      " or preprocess the data OFFLINE.");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files in advance: you may tune parameters in the "
      "following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch size"),
      ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"),
                    "interleave cycle_length"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer_size"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Reading data from files on demand: you should read data IN ADVANCE "
      "using the following tf.data API (",
      AnchorElement(absl::StrCat(kDatasetTopic, "prefetch"), "prefetch"), ", ",
      AnchorElement(absl::StrCat(kDatasetTopic, "interleave"), "interleave"),
      ", ", AnchorElement(kTfRecordDataset, "reader buffer"), ")");
  *recommendation.add_details() = absl::StrCat(
      "Other data reading or processing: you may consider using the ",
      AnchorElement(kDatasetIntro, "tf.data API"),
      " (if you are not using it now)");

  return recommendation;
}

StepSummary ComputeStepTimeSummaryInMs(
    const protobuf::RepeatedPtrField<PerCoreStepInfo>& grouped_by_step) {
  Stat<double> total_step_stats_in_ms;
  // Iterates over each step.
  for (const auto& coreid_stepinfo_map : grouped_by_step) {
    double max_per_step_stats_in_ms = 0.0;
    // Iterates over each core.
    for (const auto& coreid_and_stepinfo :
         coreid_stepinfo_map.step_info_per_core()) {
      const auto& step_info = coreid_and_stepinfo.second;
      max_per_step_stats_in_ms = std::max(
          step_info.duration_ps() / kNumPsPerMs, max_per_step_stats_in_ms);
    }
    // Step time of each step is determined by the slowest core.
    total_step_stats_in_ms.UpdateStat(max_per_step_stats_in_ms);
  }

  return GetStepSummaryForSampleStats(total_step_stats_in_ms);
}

InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
    const OpStats& op_stats) {
  InputPipelineAnalysisResult result =
      ComputeGenericInputPipelineAnalysisResult(
          op_stats.step_db().step_sequence());
  PopulateStepDiagnostics(op_stats, result.mutable_diagnostics());
  result.set_hardware_type(HardwareType_Name(
      ParseHardwareType(op_stats.run_environment().device_type())));
  GenerateHostResult(op_stats.host_op_metrics_db(), &result);

  InputPipelineAnalysisRecommendation recommendation = GenerateRecommendation();
  BottleneckAnalysis bottleneck_analysis = ComputeBottleneckAnalysis(
      result.input_time_breakdown(), result.step_details());
  result.set_input_percent(bottleneck_analysis.input_percent());
  result.set_output_percent(bottleneck_analysis.output_percent());
  result.set_idle_percent(bottleneck_analysis.idle_percent());
  result.set_compute_percent(bottleneck_analysis.compute_percent());

  recommendation.mutable_bottleneck_analysis()->PackFrom(bottleneck_analysis);
  *recommendation.mutable_summary_next_step() =
      GetSummaryNextStep(bottleneck_analysis.input_classification(),
                         result.input_time_breakdown());

  *result.mutable_recommendation() = recommendation;
  return result;
}

bool InputAnalysis(double input_percent, double all_other_percent,
                   std::string* input_classification,
                   std::string* input_statement) {
  absl::string_view non_input_time = "other time";
  if (input_percent >= kHighlyInfeedBoundThresholdInPercent) {
    *input_classification = "host";
    *input_statement = absl::StrCat(
        "Your program is HIGHLY input-bound because ", OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, you "
        "should first focus on reducing the input time.");
    return false;
  } else if (input_percent >= kModeratelyInfeedBoundThresholdInPercent) {
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is MODERATELY input-bound because ",
        OneDigit(input_percent),
        "% of the total step time sampled is waiting for input. Therefore, "
        "you would need to reduce both the input time and ",
        non_input_time, ".");
    return false;
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    // Input analysis says it is not input-bound, but "All-Other" time
    // is significant. It could still be input-bound (or Python overhead).
    *input_classification = "both";
    *input_statement = absl::StrCat(
        "Your program is POTENTIALLY input-bound because ",
        OneDigit(all_other_percent),
        "% of the total step time sampled is spent on 'All Others' time (which "
        "could be due to I/O or Python execution or both).");
    return true;
  } else {
    // Definitely not input-bound.
    *input_classification = "device";
    *input_statement =
        absl::StrCat("Your program is NOT input-bound because only ",
                     OneDigit(input_percent),
                     "% of the total step time sampled is waiting for "
                     "input. Therefore, you should focus on "
                     "reducing ",
                     non_input_time, ".");
    return false;
  }
}

void OutputAnalysis(double output_percent, std::string* output_classification,
                    std::string* output_statement) {
  if (output_percent >= kHighlyOutfeedBoundThresholdInPercent) {
    *output_classification = "host";
    *output_statement = absl::StrCat(
        "Your program is HIGHLY output-bound because ",
        OneDigit(output_percent),
        "% of the total step time sampled is spent on output. Therefore, you "
        "should first focus on reducing the output time.");
  } else if (output_percent >= kModeratelyOutfeedBoundThresholdInPercent) {
    *output_classification = "both";
    *output_statement = absl::StrCat(
        "Your program is MODERATELY output-bound because ",
        OneDigit(output_percent),
        "% of the total step time sampled is spent on output. Therefore, "
        "you would need to reduce both the output time and other time.");
  } else {
    *output_classification = "device";
    *output_statement = "";
  }
}

BottleneckAnalysis ComputeBottleneckAnalysis(
    const InputTimeBreakdown& input_time_breakdown,
    const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
        any_step_details) {
  double total_step_time_ms = 0;
  double total_input_ms = 0;
  double total_output_ms = 0;
  double total_host_compute_ms = 0;
  double total_host_prepare_ms = 0;
  double total_host_compile_ms = 0;
  double total_device_compute_ms = 0;
  double total_device_to_device_ms = 0;
  double total_device_collectives_ms = 0;
  double total_unknown_ms = 0;

  for (const google::protobuf::Any& step_details : any_step_details) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    if (!success && !step_details.type_url().empty()) {
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic"
                 << std::endl;
      return {};
    }
    total_step_time_ms += details.step_time_ms();
    total_input_ms +=
        details.host_wait_input_ms() + details.host_to_device_ms();
    total_output_ms += details.output_ms();
    total_host_prepare_ms += details.host_prepare_ms();
    total_device_compute_ms += details.device_compute_ms();
    total_device_to_device_ms += details.device_to_device_ms();
    total_device_collectives_ms += details.device_collectives_ms();
    total_host_compute_ms += details.host_compute_ms();
    total_host_compile_ms += details.host_compile_ms();
    total_unknown_ms += details.unknown_time_ms();
  }

  if (total_step_time_ms == 0) {
    BottleneckAnalysis analysis;
    analysis.set_input_classification("unknown");
    analysis.set_input_statement(
        "No step time measured. Therefore we cannot tell where the "
        "performance bottleneck is.");
    analysis.set_kernel_launch_classification("no");
    analysis.set_kernel_launch_statement("");
    analysis.set_all_other_classification("no");
    analysis.set_all_other_statement("");
    analysis.set_device_collectives_classification("no");
    analysis.set_device_collectives_statement("");
    return analysis;
  }
  double input_percent = 100.0 * total_input_ms / total_step_time_ms;
  double output_percent = 100.0 * total_output_ms / total_step_time_ms;
  double compute_percent =
      100.0 * total_device_compute_ms / total_step_time_ms;
  double device_collectives_percent =
      100.0 * total_device_collectives_ms / total_step_time_ms;

  // idle_percent includes host_prepare (i.e. kernel launch), device-to-device,
  // host compute, host compile, and unknown time.
  double idle_percent =
      std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
                        device_collectives_percent);
  double kernel_launch_percent =
      100.0 * total_host_prepare_ms / total_step_time_ms;
  double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;

  std::string input_classification;
  std::string input_statement;
  bool all_other_reported =
      InputAnalysis(input_percent, all_other_percent, &input_classification,
                    &input_statement);

  std::string device_collectives_classification;
  std::string device_collectives_statement;
  DeviceCollectivesAnalysis(device_collectives_percent,
                            &device_collectives_classification,
                            &device_collectives_statement);

  std::string kernel_launch_classification;
  std::string kernel_launch_statement;
  KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
                       &kernel_launch_classification, &kernel_launch_statement);

  std::string all_other_classification;
  std::string all_other_statement;
  AllOtherAnalysis(all_other_reported, all_other_percent,
                   &all_other_classification, &all_other_statement);

  BottleneckAnalysis analysis;
  analysis.set_input_percent(input_percent);
  analysis.set_output_percent(output_percent);
  analysis.set_idle_percent(idle_percent);
  analysis.set_compute_percent(compute_percent);

  analysis.set_input_classification(input_classification);
  analysis.set_input_statement(input_statement);
  analysis.set_kernel_launch_classification(kernel_launch_classification);
  analysis.set_kernel_launch_statement(kernel_launch_statement);
  analysis.set_all_other_classification(all_other_classification);
  analysis.set_all_other_statement(all_other_statement);
  analysis.set_device_collectives_classification(
      device_collectives_classification);
  analysis.set_device_collectives_statement(device_collectives_statement);

  return analysis;
}

std::string GetSummaryNextStep(absl::string_view input_classification,
                               const InputTimeBreakdown& breakdown) {
  std::string summary_next_step;
  if (input_classification == "host" || input_classification == "both") {
    if (!TfDataInUse(breakdown)) {
      summary_next_step = absl::StrCat(
          "Consider using ", MakeDocLink(DatasetIntroDoc(), "the tf.data API"),
          " to enable profiler's host-side analysis for input pipeline. "
          "Profiler currently does not support custom input pipeline (please "
          "ignore Section ",
          kHostAnalysisSectionNumber, " below).");
    } else {
      summary_next_step =
          absl::StrCat("Look at Section ", kHostAnalysisSectionNumber,
                       " for the breakdown of input time on the host.");
    }
  } else {
    summary_next_step = "You may skip the rest of this page.";
  }

  return summary_next_step;
}

double HostToDeviceTransferAsPercentOfInputTime(
    const InputTimeBreakdown& breakdown) {
  // Thanks to the scaling trick we did in GenerateHostResult(), we can
  // estimate the percentage of input time spent on host-to-device transfer in
  // the following way.
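  // enqueue_us is the measured host-to-device (enqueue) time, and the other
  // four components partition the estimated non-enqueue input time, so
  // dividing enqueue_us by the sum of all five gives the host-to-device share
  // of the total input time.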
  double total_input_time_us =
      breakdown.demanded_file_read_us() + breakdown.advanced_file_read_us() +
      breakdown.preprocessing_us() + breakdown.enqueue_us() +
      breakdown.unclassified_non_enqueue_us();
  return 100.0 * SafeDivide(breakdown.enqueue_us(), total_input_time_us);
}

}  // namespace profiler
}  // namespace tensorflow
831