1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
16
17 #include <sstream>
18 #include <utility>
19 #include <vector>
20
21 #include "google/protobuf/any.pb.h"
22 #include "absl/algorithm/container.h"
23 #include "absl/container/flat_hash_map.h"
24 #include "tensorflow/core/lib/gtl/map_util.h"
25 #include "tensorflow/core/platform/logging.h"
26 #include "tensorflow/core/platform/types.h"
27 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
28 #include "tensorflow/core/profiler/utils/event_span.h"
29 #include "tensorflow/core/profiler/utils/timespan.h"
30
31 namespace tensorflow {
32 namespace profiler {
33
34 // Local core id should start from 1.
35 const uint32 kDefaultGpuLocalCoreId = 1;
36
37 namespace {
38
39 // Converts from StepDetails to StepInfoResult.
ConvertStepDetailsToStepInfo(bool has_device,int64 step_num,const StepDetails & step_details)40 StepInfoResult ConvertStepDetailsToStepInfo(bool has_device, int64 step_num,
41 const StepDetails& step_details) {
42 GenericStepBreakdown generic;
43 Timespan step_time = step_details.StepTime();
44 auto& type_ps = *(generic.mutable_type_ps());
45 uint64 total_event_duration = 0;
46 for (const auto& event : step_details.Events()) {
47 // Ignore event duration outside the step marker.
48 uint64 event_duration = step_time.OverlappedDurationPs(event.span);
49 type_ps[event.type] += event_duration;
50 total_event_duration += event_duration;
51 }
52 if (total_event_duration < step_time.duration_ps()) {
53 // Some time in the step is not associated with any event. Classify them as
54 // "unknown time".
55 type_ps[UNKNOWN_TIME] += step_time.duration_ps() - total_event_duration;
56 }
57 // Determines if this particular step is a well-formed one.
58 bool well_formed_step = has_device ? (type_ps.contains(DEVICE_COMPUTE_16) ||
59 type_ps.contains(DEVICE_COMPUTE_32))
60 : type_ps.contains(HOST_COMPUTE);
61 StepInfoResult step_info;
62 step_info.mutable_step_breakdown()->PackFrom(generic);
63 if (well_formed_step) {
64 step_info.set_step_num(step_num);
65 step_info.set_begin_ps(step_time.begin_ps());
66 step_info.set_duration_ps(step_time.duration_ps());
67 } else {
68 // For a non-well-formed step, sets its duration to 0 so that it will be
69 // ignored by the caller of this function.
70 step_info.set_duration_ps(0);
71 }
72 return step_info;
73 }
74
DebugGenericStepBreakdown(const GenericStepBreakdown & generic)75 string DebugGenericStepBreakdown(const GenericStepBreakdown& generic) {
76 std::ostringstream out;
77 uint64 total_ps = 0;
78 const auto& type_ps_map = generic.type_ps();
79 for (const auto& type_ps : type_ps_map) {
80 total_ps += type_ps.second;
81 }
82 out << "Total ps = " << total_ps << std::endl;
83 for (int type = LAST_EVENT_TYPE; type >= 0; --type) {
84 const auto* ps = gtl::FindOrNull(type_ps_map, type);
85 if (ps == nullptr) continue;
86 double percent = (*ps * 100.0) / total_ps;
87 auto event_type = static_cast<EventType>(type);
88 out << PrintEventType(event_type) << ": " << percent << "%"
89 << ", ps = " << *ps << std::endl;
90 }
91 return out.str();
92 }
93
DebugStepInfo(const StepInfoResult & step_info)94 string DebugStepInfo(const StepInfoResult& step_info) {
95 std::ostringstream out;
96 out << "step_num=" << step_info.step_num()
97 << ", duration_ps=" << step_info.duration_ps()
98 << ", begin_ps=" << step_info.begin_ps() << std::endl;
99 GenericStepBreakdown generic;
100 if (step_info.step_breakdown().UnpackTo(&generic)) {
101 out << "Generic step breakdown:" << std::endl;
102 out << DebugGenericStepBreakdown(generic) << std::endl;
103 } else {
104 out << step_info.step_breakdown().DebugString() << std::endl;
105 }
106 return out.str();
107 }
108
109 } // namespace
110
ConvertStepEventsToStepDb(bool has_device,bool maybe_drop_incomplete_steps,const StepEvents & nonoverlapped_step_events)111 StepDatabaseResult ConvertStepEventsToStepDb(
112 bool has_device, bool maybe_drop_incomplete_steps,
113 const StepEvents& nonoverlapped_step_events) {
114 StepDatabaseResult step_db;
115 // Gets sorted step numbers.
116 std::vector<int64> step_numbers;
117 step_numbers.reserve(nonoverlapped_step_events.size());
118 for (const auto& step_events : nonoverlapped_step_events) {
119 step_numbers.push_back(step_events.first);
120 }
121 absl::c_sort(step_numbers);
122 for (const auto& step : step_numbers) {
123 const auto* step_details = gtl::FindOrNull(nonoverlapped_step_events, step);
124 if (step_details == nullptr) continue;
125 StepInfoResult step_info =
126 ConvertStepDetailsToStepInfo(has_device, step, *step_details);
127 if (step_info.duration_ps() == 0)
128 continue; // Do not include non-well-formed steps.
129 PerCoreStepInfo per_core_step_info;
130 per_core_step_info.set_step_num(step);
131 // When we generated StepEvents, we already put events from all device
132 // cores and cpu threads on this host into a single event stream, therefore
133 // we can't separate them anymore. Simply assigns all events to Core-0.
134 (*per_core_step_info.mutable_step_info_per_core())[kDefaultGpuLocalCoreId] =
135 std::move(step_info);
136 VLOG(2) << std::endl
137 << "step_id: " << step << ", step_info:" << std::endl
138 << DebugStepInfo((
139 *per_core_step_info
140 .mutable_step_info_per_core())[kDefaultGpuLocalCoreId]);
141 // Populates the collective ops information.
142 auto& collectives = *per_core_step_info.mutable_all_reduce_db_per_core();
143 for (const auto& it : step_details->Collectives()) {
144 collectives[it.first] = it.second;
145 }
146 // Populates the device transfer stats for this step.
147 auto& device_memory_transfers =
148 *per_core_step_info.mutable_device_memory_transfers();
149 for (const auto& dma : step_details->DeviceMemoryTransfers()) {
150 *device_memory_transfers.Add() = dma;
151 }
152 // The remaining fields in PerCoreStepInfo are not filled.
153 *step_db.add_step_sequence() = per_core_step_info;
154 }
155
156 // If we are using sampling mode and we get enough steps, we would like to
157 // drop the incomplete steps at the beginning and the end.
158 // (Sometimes CUTPI instrumentation will prolong the first step too).
159 int kDropIncomplteteStepThreshold = 5;
160 if (maybe_drop_incomplete_steps &&
161 step_db.step_sequence_size() > kDropIncomplteteStepThreshold) {
162 step_db.mutable_step_sequence()->erase(
163 step_db.mutable_step_sequence()->begin());
164 step_db.mutable_step_sequence()->RemoveLast();
165 }
166 return step_db;
167 }
168
169 } // namespace profiler
170 } // namespace tensorflow
171