1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/utils/diagnostics.h"
17 
18 #include "absl/algorithm/container.h"
19 #include "absl/strings/str_cat.h"
20 #include "absl/strings/string_view.h"
21 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
22 
23 namespace tensorflow {
24 namespace profiler {
25 
26 const absl::string_view kErrorIncompleteStep =
27     "Incomplete step observed and hence the step time is unknown."
28     "Instead, we use the trace duration as the step time. This may happen"
29     " if your profiling duration is shorter than the step time. In this"
30     " case, you may try to profile longer.";
31 
32 const absl::string_view kErrorNoStepMarker =
33     "No step marker observed and hence the step time is unknown."
34     " This may happen if (1) training steps are not instrumented (e.g., if"
35     " you are not using Keras) or (2) the profiling duration is shorter"
36     " than the step time. For (1), you need to add step instrumentation;"
37     " for (2), you may try to profile longer.";
38 
39 const absl::string_view kNoDeviceTraceCollected =
40     "No device trace was collected. This might happen if your job hadn't been "
41     "run on the device when sampling was turned on. You could try the sampling"
42     " again later.";
43 
44 const absl::string_view kStepsDropped =
45     " steps dropped. This might happen when you profile many hosts and/or many "
46     "steps. You could try to profile shorter or reduce the number of hosts "
47     "you profile.";
48 
PopulateStepDiagnostics(const OpStats & op_stats,Diagnostics * diag)49 void PopulateStepDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
50   if (op_stats.step_db().use_incomplete_step()) {
51     *diag->add_warnings() = std::string(kErrorIncompleteStep);
52   } else if (op_stats.step_db().step_sequence().empty()) {
53     *diag->add_warnings() = std::string(kErrorNoStepMarker);
54   }
55   if (op_stats.step_db().num_steps_dropped()) {
56     *diag->add_warnings() =
57         absl::StrCat(op_stats.step_db().num_steps_dropped(), kStepsDropped);
58   }
59 }
60 
PopulateOverviewDiagnostics(const OpStats & op_stats,Diagnostics * diag)61 void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
62   *diag->mutable_errors() = op_stats.diagnostics().errors();
63   absl::c_sort(*diag->mutable_errors());
64   if (diag->errors().empty()) {
65     // Shows run-environment error only if there is no other existing error.
66     if (op_stats.run_environment().device_type() != "CPU" &&
67         op_stats.run_environment().device_core_count() <= 0) {
68       *diag->add_errors() = std::string(kNoDeviceTraceCollected);
69     }
70   }
71   *diag->mutable_warnings() = op_stats.diagnostics().warnings();
72   PopulateStepDiagnostics(op_stats, diag);
73 }
74 
75 }  // namespace profiler
76 }  // namespace tensorflow
77