1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/core/profiler/utils/event_span.h"
16
17 #include <string>
18 #include <utility>
19 #include <vector>
20
21 #include "absl/algorithm/container.h"
22 #include "absl/container/flat_hash_map.h"
23 #include "absl/strings/match.h"
24 #include "absl/strings/str_cat.h"
25 #include "absl/strings/string_view.h"
26 #include "tensorflow/core/lib/gtl/map_util.h"
27 #include "tensorflow/core/platform/types.h"
28 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
29 #include "tensorflow/core/profiler/utils/timespan.h"
30
31 namespace tensorflow {
32 namespace profiler {
33
34 namespace {
35
36 // Representing a boundary of an event.
37 struct EventBoundary {
38 // Time at this boundary.
39 uint64 time_ps;
40 // Type of the event.
41 EventType type;
42 // True if this is the start of the event; False if this is the end.
43 bool is_start;
EventBoundarytensorflow::profiler::__anon6b3e31860111::EventBoundary44 EventBoundary(uint64 time_ps, EventType type, bool is_start)
45 : time_ps(time_ps), type(type), is_start(is_start) {}
46 };
47
48 // Returns true if EventBoundary a should appear before EventBoundary b.
CmpEventBoundaries(const EventBoundary & a,const EventBoundary & b)49 bool CmpEventBoundaries(const EventBoundary& a, const EventBoundary& b) {
50 if (a.time_ps == b.time_ps) {
51 if (a.is_start == b.is_start) {
52 // Puts the higher-priority type before the lower-priority type if they
53 // have the same time and same boundary type.
54 return a.type > b.type;
55 } else {
56 // Puts the "end" bounary before the "start" boundary if they have the
57 // same time.
58 return !a.is_start;
59 }
60 }
61 // In ascending order of time.
62 return a.time_ps < b.time_ps;
63 }
64
65 // Generates vector of event boundaries from the given overlapped_events.
GenerateEventBoundaries(const std::vector<EventTypeSpan> & overlapped_events)66 std::vector<EventBoundary> GenerateEventBoundaries(
67 const std::vector<EventTypeSpan>& overlapped_events) {
68 std::vector<EventBoundary> boundaries;
69 boundaries.reserve(2 * overlapped_events.size());
70 for (const auto& event : overlapped_events) {
71 boundaries.push_back(
72 {event.span.begin_ps(), event.type, /*is_start=*/true});
73 boundaries.push_back({event.span.end_ps(), event.type, /*is_start=*/false});
74 }
75 absl::c_sort(boundaries, CmpEventBoundaries);
76 return boundaries;
77 }
78
79 // A class to track the highest priority that an event should be assigned.
80 class PriorityTracker {
81 private:
82 // The current maximum priority.
83 EventType current_max_priority_;
84 // A count for each possible priority.
85 std::vector<int64> priority_count_;
86
87 public:
PriorityTracker()88 PriorityTracker() {
89 current_max_priority_ = UNKNOWN_TIME;
90 priority_count_.resize(LAST_EVENT_TYPE + 1, 0);
91 }
92 // Updates current_max_priority_ and priority_count_[] given the boundary.
93 // Returns the new current_max_priority_.
Update(const EventBoundary & boundary)94 EventType Update(const EventBoundary& boundary) {
95 EventType event_type = boundary.type;
96 bool is_start = boundary.is_start;
97 if (is_start) {
98 priority_count_[event_type]++;
99 if (event_type > current_max_priority_) {
100 current_max_priority_ = event_type;
101 }
102 } else {
103 priority_count_[event_type]--;
104 if (event_type == current_max_priority_ &&
105 priority_count_[event_type] == 0) {
106 // Reduces current_max_priority_ to the first event type (starting from
107 // the highest priority) that has a non-zero count.
108 bool found = false;
109 for (int i = event_type - 1; i >= 0; i--) {
110 if (priority_count_[i] > 0) {
111 current_max_priority_ = static_cast<EventType>(i);
112 found = true;
113 break;
114 }
115 }
116 if (!found) current_max_priority_ = UNKNOWN_TIME;
117 }
118 }
119 return current_max_priority_;
120 }
121 };
122
CombineStepDetails(const StepDetails & src,StepDetails * dst)123 void CombineStepDetails(const StepDetails& src, StepDetails* dst) {
124 dst->AppendMarkers(src.Markers());
125 dst->AppendEvents(src.Events());
126 dst->AppendCollectives(src.Collectives());
127 dst->AggregateDeviceMemoryTransfers(src.DeviceMemoryTransfers());
128 }
129
ClassifyDeviceCompute(absl::string_view event_name,absl::string_view tensor_shapes)130 EventType ClassifyDeviceCompute(absl::string_view event_name,
131 absl::string_view tensor_shapes) {
132 if (tensor_shapes.empty()) {
133 // Deduces the precision from the name.
134 if (absl::StrContains(event_name, "half") ||
135 absl::StrContains(event_name, "fp16"))
136 return DEVICE_COMPUTE_16;
137 else
138 return DEVICE_COMPUTE_32;
139 } else {
140 // Deduces the precision from the shapes.
141 if (absl::StrContains(tensor_shapes, "half"))
142 return DEVICE_COMPUTE_16;
143 else
144 return DEVICE_COMPUTE_32;
145 }
146 }
147
148 constexpr int kNumGenericEventTypes = GenericEventType::kLastGenericEventType -
149 GenericEventType::kFirstGenericEventType +
150 1;
151
152 using GenericEventTypeStrMap =
153 absl::flat_hash_map<GenericEventType, absl::string_view>;
154
GetGenericEventTypeStrMap()155 const GenericEventTypeStrMap& GetGenericEventTypeStrMap() {
156 static const auto* generic_event_type_str_map = new GenericEventTypeStrMap({
157 {kDeviceCompute, "Device compute"},
158 {kDeviceToDevice, "Device to device"},
159 {kDeviceCollectives, "Device collective communication"},
160 {kHostCompute, "Host compute"},
161 {kHostPrepare, "Kernel launch"},
162 {kInput, "Input"},
163 {kOutput, "Output"},
164 {kCompile, "Compilation"},
165 {kAllOthers, "All others"},
166 });
167 DCHECK_EQ(generic_event_type_str_map->size(), kNumGenericEventTypes);
168 return *generic_event_type_str_map;
169 }
170
171 } // namespace
172
GetGenericEventTypeStr(GenericEventType event_type)173 absl::string_view GetGenericEventTypeStr(GenericEventType event_type) {
174 return GetGenericEventTypeStrMap().at(event_type);
175 }
176
ClassifyGpuEvent(absl::string_view event_name,absl::string_view tensor_shapes)177 EventType ClassifyGpuEvent(absl::string_view event_name,
178 absl::string_view tensor_shapes) {
179 if (absl::StartsWithIgnoreCase(event_name, "MEMCPYHtoD"))
180 return HOST_TO_DEVICE;
181 if (absl::StartsWithIgnoreCase(event_name, "MEMCPYDtoH"))
182 return DEVICE_TO_HOST;
183 if (absl::StartsWithIgnoreCase(event_name, "MEMCPYDtoD"))
184 return DEVICE_TO_DEVICE;
185 if (absl::StartsWithIgnoreCase(event_name, "nccl")) {
186 return DEVICE_COLLECTIVES;
187 }
188 return ClassifyDeviceCompute(event_name, tensor_shapes);
189 }
190
ClassifyCpuEvent(absl::string_view event_name,int64 correlation_id,bool has_device)191 EventType ClassifyCpuEvent(absl::string_view event_name, int64 correlation_id,
192 bool has_device) {
193 if (absl::StartsWithIgnoreCase(event_name, "MEMCPYHtoD") ||
194 absl::StrContains(event_name, "Infeed"))
195 return HOST_TO_DEVICE;
196 if (absl::StartsWithIgnoreCase(event_name, "MEMCPYHtoH")) return HOST_TO_HOST;
197 // TODO(b/150420972): Separate runtime overhead from actual compute for
198 // CPU-only.
199 if (has_device &&
200 (correlation_id >= 0 ||
201 absl::StartsWithIgnoreCase(event_name, "ExecutorState::Process"))) {
202 return HOST_PREPARE;
203 }
204 if (absl::StartsWithIgnoreCase(event_name, "IteratorGetNext"))
205 return HOST_WAIT_INPUT;
206 return HOST_COMPUTE;
207 }
208
PrintEventType(EventType event_type)209 std::string PrintEventType(EventType event_type) {
210 switch (event_type) {
211 case UNKNOWN_TIME:
212 return "unknown_time";
213 case HOST_COMPUTE:
214 return "host_compute";
215 case HOST_COMPILE:
216 return "host_compile";
217 case HOST_TO_HOST:
218 return "host_to_host";
219 case HOST_TO_DEVICE:
220 return "host_to_device";
221 case HOST_PREPARE:
222 return "host_prepare";
223 case DEVICE_COLLECTIVES:
224 return "device_collectives";
225 case HOST_WAIT_INPUT:
226 return "host_wait_input";
227 case DEVICE_TO_DEVICE:
228 return "device_to_device";
229 case DEVICE_TO_HOST:
230 return "device_to_host";
231 case DEVICE_COMPUTE_32:
232 return "device_compute_32";
233 case DEVICE_COMPUTE_16:
234 return "device_compute_16";
235 case DEVICE_WAIT_DEVICE:
236 return "device_wait_device";
237 case DEVICE_WAIT_HOST:
238 return "device_wait_host";
239 default:
240 return "unexpected";
241 }
242 }
243
PrintEventTypeSpan(const EventTypeSpan & event_type_span)244 std::string PrintEventTypeSpan(const EventTypeSpan& event_type_span) {
245 return absl::StrCat("(", PrintEventType(event_type_span.type), ", ",
246 event_type_span.span.DebugString(), ")");
247 }
248
PrintStepMarkerType(StepMarkerType type)249 absl::string_view PrintStepMarkerType(StepMarkerType type) {
250 switch (type) {
251 case StepMarkerType::kExplicitHostStepMarker:
252 return "ExplicitHostStepMarker";
253 case StepMarkerType::kImplicitHostStepMarker:
254 return "ImplicitHostStepMarker";
255 case StepMarkerType::kDeviceStepMarker:
256 return "DeviceStepMarker";
257 }
258 }
259
PrintStepMarker(const StepMarker & step_marker)260 std::string PrintStepMarker(const StepMarker& step_marker) {
261 return absl::StrCat("(", PrintStepMarkerType(step_marker.type), ", ",
262 step_marker.event_name, ", ",
263 step_marker.span.DebugString(), ")");
264 }
265
PrintStepEvents(const StepEvents & step_events)266 std::string PrintStepEvents(const StepEvents& step_events) {
267 std::vector<int64> step_ids;
268 step_ids.reserve(step_events.size());
269 for (const auto& id_details : step_events) {
270 step_ids.push_back(id_details.first);
271 }
272 absl::c_sort(step_ids);
273 std::string result = "{";
274 for (auto id : step_ids) {
275 absl::StrAppend(&result, "\n");
276 auto* details = gtl::FindOrNull(step_events, id);
277 std::string details_str = details ? details->DebugString() : "()";
278 absl::StrAppend(&result, id, ":", details_str);
279 }
280 return absl::StrCat(result, "\n}");
281 }
282
CombineStepEvents(const StepEvents & src,StepEvents * dst)283 void CombineStepEvents(const StepEvents& src, StepEvents* dst) {
284 for (const auto& step_details : src) {
285 int64 step_id = step_details.first;
286 const StepDetails& src_details = step_details.second;
287 StepDetails* dst_details = &(*dst)[step_id];
288 CombineStepDetails(src_details, dst_details);
289 }
290 }
291
ToNonOverlappedEvents(const std::vector<EventTypeSpan> & overlapped_events)292 std::vector<EventTypeSpan> ToNonOverlappedEvents(
293 const std::vector<EventTypeSpan>& overlapped_events) {
294 std::vector<EventBoundary> event_boundaries =
295 GenerateEventBoundaries(overlapped_events);
296 std::vector<EventTypeSpan> result;
297 if (event_boundaries.empty()) return result;
298 result.reserve(event_boundaries.size());
299 PriorityTracker priority_tracker;
300 for (int64 i = 0, end = (event_boundaries.size() - 1); i < end; i++) {
301 EventType highest_priority = priority_tracker.Update(event_boundaries[i]);
302 result.push_back({highest_priority, Timespan::FromEndPoints(
303 event_boundaries[i].time_ps,
304 event_boundaries[i + 1].time_ps)});
305 }
306 return result;
307 }
308
309 // Converts from overlapped step-events to non-overlapped step-events.
ToNonOverlappedStepEvents(const StepEvents & overlapped_step_events)310 StepEvents ToNonOverlappedStepEvents(const StepEvents& overlapped_step_events) {
311 StepEvents non_overlapped_step_events;
312 for (const auto& step_events : overlapped_step_events) {
313 const auto& step_id = step_events.first;
314 const auto& step_details = step_events.second;
315 *non_overlapped_step_events[step_id].MutableMarkers() =
316 step_details.Markers();
317 *non_overlapped_step_events[step_id].MutableEvents() =
318 ToNonOverlappedEvents(step_details.Events());
319 *non_overlapped_step_events[step_id].MutableCollectives() =
320 step_details.Collectives();
321 *non_overlapped_step_events[step_id].MutableDeviceMemoryTransfers() =
322 step_details.DeviceMemoryTransfers();
323 }
324 return non_overlapped_step_events;
325 }
326
AddMarker(const StepMarker & m)327 void StepDetails::AddMarker(const StepMarker& m) { markers_.push_back(m); }
328
AddEvent(const EventTypeSpan & e)329 void StepDetails::AddEvent(const EventTypeSpan& e) { events_.push_back(e); }
330
AppendMarkers(const std::vector<StepMarker> & other_markers)331 void StepDetails::AppendMarkers(const std::vector<StepMarker>& other_markers) {
332 markers_.insert(markers_.end(), other_markers.begin(), other_markers.end());
333 }
334
AppendEvents(const std::vector<EventTypeSpan> & other_events)335 void StepDetails::AppendEvents(const std::vector<EventTypeSpan>& other_events) {
336 events_.insert(events_.end(), other_events.begin(), other_events.end());
337 }
338
AppendCollectives(const absl::flat_hash_map<uint32,AllReduceDbResult> & collectives)339 void StepDetails::AppendCollectives(
340 const absl::flat_hash_map<uint32, AllReduceDbResult>& collectives) {
341 for (const auto& it : collectives) {
342 collectives_[it.first] = it.second;
343 }
344 }
345
AggregateDeviceMemoryTransfers(const std::vector<DeviceMemoryTransfer> device_memory_transfers)346 void StepDetails::AggregateDeviceMemoryTransfers(
347 const std::vector<DeviceMemoryTransfer> device_memory_transfers) {
348 if (device_memory_transfers.size() != device_memory_transfers_.size()) {
349 return; // Sanity check.
350 }
351 for (size_t i = 0; i < device_memory_transfers.size(); ++i) {
352 device_memory_transfers_[i].set_occurrence(
353 device_memory_transfers_[i].occurrence() +
354 device_memory_transfers[i].occurrence());
355 device_memory_transfers_[i].set_bytes_transferred(
356 device_memory_transfers_[i].bytes_transferred() +
357 device_memory_transfers[i].bytes_transferred());
358 device_memory_transfers_[i].set_time_us(
359 device_memory_transfers_[i].time_us() +
360 device_memory_transfers[i].time_us());
361 }
362 }
363
AddCollectiveOpEvent(uint64 core_id,const AllReduceInfo & e)364 void StepDetails::AddCollectiveOpEvent(uint64 core_id, const AllReduceInfo& e) {
365 *collectives_[core_id].add_all_reduce_info() = e;
366 }
367
AddDeviceMemoryTransferEvent(EventType event_type,const Timespan & time_span,uint64 bytes)368 void StepDetails::AddDeviceMemoryTransferEvent(EventType event_type,
369 const Timespan& time_span,
370 uint64 bytes) {
371 int index = 0;
372 switch (event_type) {
373 case HOST_TO_DEVICE:
374 index = 0;
375 break;
376 case DEVICE_TO_HOST:
377 index = 1;
378 break;
379 case DEVICE_TO_DEVICE:
380 index = 2;
381 break;
382 default:
383 return;
384 }
385 device_memory_transfers_[index].set_occurrence(
386 device_memory_transfers_[index].occurrence() + 1);
387 device_memory_transfers_[index].set_time_us(
388 device_memory_transfers_[index].time_us() +
389 time_span.duration_ps() / 1000000.0);
390 device_memory_transfers_[index].set_bytes_transferred(
391 device_memory_transfers_[index].bytes_transferred() + bytes);
392 }
393
StepTime() const394 Timespan StepDetails::StepTime() const {
395 Timespan max_host_step_time;
396 Timespan max_device_step_time;
397 for (const auto& marker : markers_) {
398 Timespan& cur_max_step_time =
399 marker.type == StepMarkerType::kDeviceStepMarker ? max_device_step_time
400 : max_host_step_time;
401 const Timespan& new_step_time = marker.span;
402 if (new_step_time.duration_ps() > cur_max_step_time.duration_ps())
403 cur_max_step_time = new_step_time;
404 }
405 // CPU-only profile.
406 if (max_device_step_time.Empty()) {
407 return max_host_step_time;
408 }
409
410 // If the host step time includes the device step time, use the host step
411 // time. This covers the case where the device is synchronized at the end of
412 // each step.
413 if (max_host_step_time.Includes(max_device_step_time)) {
414 return max_host_step_time;
415 }
416 return max_device_step_time;
417 }
418
DebugString() const419 std::string StepDetails::DebugString() const {
420 std::string result = "([";
421 for (int i = 0, end = markers_.size(); i < end; i++) {
422 if (i > 0) absl::StrAppend(&result, ", ");
423 absl::StrAppend(&result, PrintStepMarker(markers_[i]));
424 }
425 absl::StrAppend(&result, "], [");
426 for (int i = 0, end = events_.size(); i < end; i++) {
427 if (i > 0) absl::StrAppend(&result, ", ");
428 absl::StrAppend(&result, PrintEventTypeSpan(events_[i]));
429 }
430 return absl::StrCat(result, "])");
431 }
432
operator ==(const StepDetails & other) const433 bool StepDetails::operator==(const StepDetails& other) const {
434 const auto& other_markers = other.Markers();
435 if (markers_.size() != other_markers.size()) return false;
436 for (uint64 i = 0; i < markers_.size(); i++) {
437 if (markers_[i] != other_markers[i]) return false;
438 }
439 const auto& other_events = other.Events();
440 if (events_.size() != other_events.size()) return false;
441 for (uint64 i = 0; i < events_.size(); i++) {
442 if (events_[i] != other_events[i]) return false;
443 }
444 return true;
445 }
446
operator ==(const StepEvents & a,const StepEvents & b)447 bool operator==(const StepEvents& a, const StepEvents& b) {
448 if (a.size() != b.size()) return false;
449 for (const auto& id_details : a) {
450 const auto a_id = id_details.first;
451 const auto& a_details = id_details.second;
452 const auto* b_details = gtl::FindOrNull(b, a_id);
453 if (b_details == nullptr) return false;
454 if (a_details != *b_details) return false;
455 }
456 return true;
457 }
458
ComputePrecisionStats(const StepEvents & nonoverlapped_step_events)459 PrecisionStats ComputePrecisionStats(
460 const StepEvents& nonoverlapped_step_events) {
461 int64 compute_32bit_ps = 0;
462 int64 compute_16bit_ps = 0;
463 for (const auto& id_details : nonoverlapped_step_events) {
464 for (const auto& event : id_details.second.Events()) {
465 switch (event.type) {
466 case DEVICE_COMPUTE_32:
467 compute_32bit_ps += event.span.duration_ps();
468 break;
469 case DEVICE_COMPUTE_16:
470 compute_16bit_ps += event.span.duration_ps();
471 break;
472 default:
473 break;
474 }
475 }
476 }
477 PrecisionStats precision_stats;
478 precision_stats.set_compute_32bit_ps(compute_32bit_ps);
479 precision_stats.set_compute_16bit_ps(compute_16bit_ps);
480 return precision_stats;
481 }
482
483 } // namespace profiler
484 } // namespace tensorflow
485