/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/utils/event_span.h"

#include <string>
#include <utility>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/utils/timespan.h"

namespace tensorflow {
namespace profiler {

namespace {

// Represents a boundary of an event.
struct EventBoundary {
  // Time at this boundary.
  uint64 time_ps;
  // Type of the event.
  EventType type;
  // True if this is the start of the event; false if this is the end.
  bool is_start;
  EventBoundary(uint64 time_ps, EventType type, bool is_start)
      : time_ps(time_ps), type(type), is_start(is_start) {}
};
// Returns true if EventBoundary a should appear before EventBoundary b.
bool CmpEventBoundaries(const EventBoundary& a, const EventBoundary& b) {
  if (a.time_ps == b.time_ps) {
    if (a.is_start == b.is_start) {
      // Puts the higher-priority type before the lower-priority type if they
      // have the same time and same boundary type.
      return a.type > b.type;
    } else {
      // Puts the "end" boundary before the "start" boundary if they have the
      // same time.
      return !a.is_start;
    }
  }
  // In ascending order of time.
  return a.time_ps < b.time_ps;
}

// Generates a vector of event boundaries from the given overlapped_events.
std::vector<EventBoundary> GenerateEventBoundaries(
    const std::vector<EventTypeSpan>& overlapped_events) {
  std::vector<EventBoundary> boundaries;
  boundaries.reserve(2 * overlapped_events.size());
  for (const auto& event : overlapped_events) {
    boundaries.push_back(
        {event.span.begin_ps(), event.type, /*is_start=*/true});
    boundaries.push_back({event.span.end_ps(), event.type, /*is_start=*/false});
  }
  absl::c_sort(boundaries, CmpEventBoundaries);
  return boundaries;
}

// A class to track the highest priority that an event should be assigned.
class PriorityTracker {
 private:
  // The current maximum priority.
  EventType current_max_priority_;
  // A count for each possible priority.
  std::vector<int64> priority_count_;

 public:
  PriorityTracker() {
    current_max_priority_ = UNKNOWN_TIME;
    priority_count_.resize(LAST_EVENT_TYPE + 1, 0);
  }
  // Updates current_max_priority_ and priority_count_[] given the boundary.
  // Returns the new current_max_priority_.
  EventType Update(const EventBoundary& boundary) {
    EventType event_type = boundary.type;
    bool is_start = boundary.is_start;
    if (is_start) {
      priority_count_[event_type]++;
      if (event_type > current_max_priority_) {
        current_max_priority_ = event_type;
      }
    } else {
      priority_count_[event_type]--;
      if (event_type == current_max_priority_ &&
          priority_count_[event_type] == 0) {
        // Reduces current_max_priority_ to the first event type (starting from
        // the highest priority) that has a non-zero count.
        bool found = false;
        for (int i = event_type - 1; i >= 0; i--) {
          if (priority_count_[i] > 0) {
            current_max_priority_ = static_cast<EventType>(i);
            found = true;
            break;
          }
        }
        if (!found) current_max_priority_ = UNKNOWN_TIME;
      }
    }
    return current_max_priority_;
  }
};

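// Combines the step details in src into dst: markers, events, and collectives
// are appended; device memory transfers are aggregated element-wise.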
void CombineStepDetails(const StepDetails& src, StepDetails* dst) {
  dst->AppendMarkers(src.Markers());
  dst->AppendEvents(src.Events());
  dst->AppendCollectives(src.Collectives());
  dst->AggregateDeviceMemoryTransfers(src.DeviceMemoryTransfers());
}

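// Classifies a device compute event as 16-bit or 32-bit precision, using the
// tensor shapes when available and falling back to the event name otherwise.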
EventType ClassifyDeviceCompute(absl::string_view event_name,
                                absl::string_view tensor_shapes) {
  if (tensor_shapes.empty()) {
    // Deduces the precision from the name.
    if (absl::StrContains(event_name, "half") ||
        absl::StrContains(event_name, "fp16"))
      return DEVICE_COMPUTE_16;
    else
      return DEVICE_COMPUTE_32;
  } else {
    // Deduces the precision from the shapes.
    if (absl::StrContains(tensor_shapes, "half"))
      return DEVICE_COMPUTE_16;
    else
      return DEVICE_COMPUTE_32;
  }
}

constexpr int kNumGenericEventTypes = GenericEventType::kLastGenericEventType -
                                      GenericEventType::kFirstGenericEventType +
                                      1;

using GenericEventTypeStrMap =
    absl::flat_hash_map<GenericEventType, absl::string_view>;

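// Returns the map from GenericEventType to its human-readable string.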
const GenericEventTypeStrMap& GetGenericEventTypeStrMap() {
  static const auto* generic_event_type_str_map = new GenericEventTypeStrMap({
      {kDeviceCompute, "Device compute"},
      {kDeviceToDevice, "Device to device"},
      {kDeviceCollectives, "Device collective communication"},
      {kHostCompute, "Host compute"},
      {kHostPrepare, "Kernel launch"},
      {kInput, "Input"},
      {kOutput, "Output"},
      {kCompile, "Compilation"},
      {kAllOthers, "All others"},
  });
  DCHECK_EQ(generic_event_type_str_map->size(), kNumGenericEventTypes);
  return *generic_event_type_str_map;
}

}  // namespace

absl::string_view GetGenericEventTypeStr(GenericEventType event_type) {
  return GetGenericEventTypeStrMap().at(event_type);
}

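// Classifies a GPU event by name: memcpy and NCCL events are mapped to the
// corresponding transfer/collective types; everything else is device compute.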
EventType ClassifyGpuEvent(absl::string_view event_name,
                           absl::string_view tensor_shapes) {
  if (absl::StartsWithIgnoreCase(event_name, "MEMCPYHtoD"))
    return HOST_TO_DEVICE;
  if (absl::StartsWithIgnoreCase(event_name, "MEMCPYDtoH"))
    return DEVICE_TO_HOST;
  if (absl::StartsWithIgnoreCase(event_name, "MEMCPYDtoD"))
    return DEVICE_TO_DEVICE;
  if (absl::StartsWithIgnoreCase(event_name, "nccl")) {
    return DEVICE_COLLECTIVES;
  }
  return ClassifyDeviceCompute(event_name, tensor_shapes);
}

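// Classifies a host-side CPU event by name. When a device is present, events
// with a valid correlation id (or ExecutorState::Process events) are treated
// as host-side kernel-launch preparation.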
EventType ClassifyCpuEvent(absl::string_view event_name, int64 correlation_id,
                           bool has_device) {
  if (absl::StartsWithIgnoreCase(event_name, "MEMCPYHtoD") ||
      absl::StrContains(event_name, "Infeed"))
    return HOST_TO_DEVICE;
  if (absl::StartsWithIgnoreCase(event_name, "MEMCPYHtoH")) return HOST_TO_HOST;
  // TODO(b/150420972): Separate runtime overhead from actual compute for
  // CPU-only.
  if (has_device &&
      (correlation_id >= 0 ||
       absl::StartsWithIgnoreCase(event_name, "ExecutorState::Process"))) {
    return HOST_PREPARE;
  }
  if (absl::StartsWithIgnoreCase(event_name, "IteratorGetNext"))
    return HOST_WAIT_INPUT;
  return HOST_COMPUTE;
}

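// Returns the name of the given EventType for debugging output.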
std::string PrintEventType(EventType event_type) {
  switch (event_type) {
    case UNKNOWN_TIME:
      return "unknown_time";
    case HOST_COMPUTE:
      return "host_compute";
    case HOST_COMPILE:
      return "host_compile";
    case HOST_TO_HOST:
      return "host_to_host";
    case HOST_TO_DEVICE:
      return "host_to_device";
    case HOST_PREPARE:
      return "host_prepare";
    case DEVICE_COLLECTIVES:
      return "device_collectives";
    case HOST_WAIT_INPUT:
      return "host_wait_input";
    case DEVICE_TO_DEVICE:
      return "device_to_device";
    case DEVICE_TO_HOST:
      return "device_to_host";
    case DEVICE_COMPUTE_32:
      return "device_compute_32";
    case DEVICE_COMPUTE_16:
      return "device_compute_16";
    case DEVICE_WAIT_DEVICE:
      return "device_wait_device";
    case DEVICE_WAIT_HOST:
      return "device_wait_host";
    default:
      return "unexpected";
  }
}

std::string PrintEventTypeSpan(const EventTypeSpan& event_type_span) {
  return absl::StrCat("(", PrintEventType(event_type_span.type), ", ",
                      event_type_span.span.DebugString(), ")");
}

absl::string_view PrintStepMarkerType(StepMarkerType type) {
  switch (type) {
    case StepMarkerType::kExplicitHostStepMarker:
      return "ExplicitHostStepMarker";
    case StepMarkerType::kImplicitHostStepMarker:
      return "ImplicitHostStepMarker";
    case StepMarkerType::kDeviceStepMarker:
      return "DeviceStepMarker";
  }
}

std::string PrintStepMarker(const StepMarker& step_marker) {
  return absl::StrCat("(", PrintStepMarkerType(step_marker.type), ", ",
                      step_marker.event_name, ", ",
                      step_marker.span.DebugString(), ")");
}

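// Prints the given StepEvents with step ids in ascending order.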
std::string PrintStepEvents(const StepEvents& step_events) {
  std::vector<int64> step_ids;
  step_ids.reserve(step_events.size());
  for (const auto& id_details : step_events) {
    step_ids.push_back(id_details.first);
  }
  absl::c_sort(step_ids);
  std::string result = "{";
  for (auto id : step_ids) {
    absl::StrAppend(&result, "\n");
    auto* details = gtl::FindOrNull(step_events, id);
    std::string details_str = details ? details->DebugString() : "()";
    absl::StrAppend(&result, id, ":", details_str);
  }
  return absl::StrCat(result, "\n}");
}

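// Merges the step details in src into the matching step ids in dst, creating
// new entries for step ids that dst does not yet contain.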
void CombineStepEvents(const StepEvents& src, StepEvents* dst) {
  for (const auto& step_details : src) {
    int64 step_id = step_details.first;
    const StepDetails& src_details = step_details.second;
    StepDetails* dst_details = &(*dst)[step_id];
    CombineStepDetails(src_details, dst_details);
  }
}

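// Converts from overlapped events to non-overlapped events by sweeping over
// the sorted event boundaries: each interval between two adjacent boundaries
// is assigned the highest-priority event type active during that interval.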
std::vector<EventTypeSpan> ToNonOverlappedEvents(
    const std::vector<EventTypeSpan>& overlapped_events) {
  std::vector<EventBoundary> event_boundaries =
      GenerateEventBoundaries(overlapped_events);
  std::vector<EventTypeSpan> result;
  if (event_boundaries.empty()) return result;
  result.reserve(event_boundaries.size());
  PriorityTracker priority_tracker;
  for (int64 i = 0, end = (event_boundaries.size() - 1); i < end; i++) {
    EventType highest_priority = priority_tracker.Update(event_boundaries[i]);
    result.push_back({highest_priority, Timespan::FromEndPoints(
                                            event_boundaries[i].time_ps,
                                            event_boundaries[i + 1].time_ps)});
  }
  return result;
}

// Converts from overlapped step-events to non-overlapped step-events.
StepEvents ToNonOverlappedStepEvents(
    const StepEvents& overlapped_step_events) {
  StepEvents non_overlapped_step_events;
  for (const auto& step_events : overlapped_step_events) {
    const auto& step_id = step_events.first;
    const auto& step_details = step_events.second;
    *non_overlapped_step_events[step_id].MutableMarkers() =
        step_details.Markers();
    *non_overlapped_step_events[step_id].MutableEvents() =
        ToNonOverlappedEvents(step_details.Events());
    *non_overlapped_step_events[step_id].MutableCollectives() =
        step_details.Collectives();
    *non_overlapped_step_events[step_id].MutableDeviceMemoryTransfers() =
        step_details.DeviceMemoryTransfers();
  }
  return non_overlapped_step_events;
}

void StepDetails::AddMarker(const StepMarker& m) { markers_.push_back(m); }

void StepDetails::AddEvent(const EventTypeSpan& e) { events_.push_back(e); }

void StepDetails::AppendMarkers(const std::vector<StepMarker>& other_markers) {
  markers_.insert(markers_.end(), other_markers.begin(), other_markers.end());
}

void StepDetails::AppendEvents(const std::vector<EventTypeSpan>& other_events) {
  events_.insert(events_.end(), other_events.begin(), other_events.end());
}

void StepDetails::AppendCollectives(
    const absl::flat_hash_map<uint32, AllReduceDbResult>& collectives) {
  for (const auto& it : collectives) {
    collectives_[it.first] = it.second;
  }
}

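// Element-wise accumulation of transfer statistics; both vectors keep one
// entry per transfer direction and must therefore be the same size.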
void StepDetails::AggregateDeviceMemoryTransfers(
    const std::vector<DeviceMemoryTransfer> device_memory_transfers) {
  if (device_memory_transfers.size() != device_memory_transfers_.size()) {
    return;  // Sanity check.
  }
  for (size_t i = 0; i < device_memory_transfers.size(); ++i) {
    device_memory_transfers_[i].set_occurrence(
        device_memory_transfers_[i].occurrence() +
        device_memory_transfers[i].occurrence());
    device_memory_transfers_[i].set_bytes_transferred(
        device_memory_transfers_[i].bytes_transferred() +
        device_memory_transfers[i].bytes_transferred());
    device_memory_transfers_[i].set_time_us(
        device_memory_transfers_[i].time_us() +
        device_memory_transfers[i].time_us());
  }
}

void StepDetails::AddCollectiveOpEvent(uint64 core_id, const AllReduceInfo& e) {
  *collectives_[core_id].add_all_reduce_info() = e;
}

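// device_memory_transfers_ keeps one entry per direction: index 0 is
// host-to-device, 1 is device-to-host, 2 is device-to-device; all other
// event types are ignored. Durations are accumulated in microseconds.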
void StepDetails::AddDeviceMemoryTransferEvent(EventType event_type,
                                               const Timespan& time_span,
                                               uint64 bytes) {
  int index = 0;
  switch (event_type) {
    case HOST_TO_DEVICE:
      index = 0;
      break;
    case DEVICE_TO_HOST:
      index = 1;
      break;
    case DEVICE_TO_DEVICE:
      index = 2;
      break;
    default:
      return;
  }
  device_memory_transfers_[index].set_occurrence(
      device_memory_transfers_[index].occurrence() + 1);
  device_memory_transfers_[index].set_time_us(
      device_memory_transfers_[index].time_us() +
      time_span.duration_ps() / 1000000.0);
  device_memory_transfers_[index].set_bytes_transferred(
      device_memory_transfers_[index].bytes_transferred() + bytes);
}

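// Returns the step time: the longest host step marker for CPU-only profiles,
// otherwise the longest device step marker, unless the host marker fully
// includes it (i.e., the device is synchronized at the end of each step).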
Timespan StepDetails::StepTime() const {
  Timespan max_host_step_time;
  Timespan max_device_step_time;
  for (const auto& marker : markers_) {
    Timespan& cur_max_step_time =
        marker.type == StepMarkerType::kDeviceStepMarker ? max_device_step_time
                                                         : max_host_step_time;
    const Timespan& new_step_time = marker.span;
    if (new_step_time.duration_ps() > cur_max_step_time.duration_ps())
      cur_max_step_time = new_step_time;
  }
  // CPU-only profile.
  if (max_device_step_time.Empty()) {
    return max_host_step_time;
  }

  // If the host step time includes the device step time, use the host step
  // time. This covers the case where the device is synchronized at the end of
  // each step.
  if (max_host_step_time.Includes(max_device_step_time)) {
    return max_host_step_time;
  }
  return max_device_step_time;
}

std::string StepDetails::DebugString() const {
  std::string result = "([";
  for (int i = 0, end = markers_.size(); i < end; i++) {
    if (i > 0) absl::StrAppend(&result, ", ");
    absl::StrAppend(&result, PrintStepMarker(markers_[i]));
  }
  absl::StrAppend(&result, "], [");
  for (int i = 0, end = events_.size(); i < end; i++) {
    if (i > 0) absl::StrAppend(&result, ", ");
    absl::StrAppend(&result, PrintEventTypeSpan(events_[i]));
  }
  return absl::StrCat(result, "])");
}

bool StepDetails::operator==(const StepDetails& other) const {
  const auto& other_markers = other.Markers();
  if (markers_.size() != other_markers.size()) return false;
  for (uint64 i = 0; i < markers_.size(); i++) {
    if (markers_[i] != other_markers[i]) return false;
  }
  const auto& other_events = other.Events();
  if (events_.size() != other_events.size()) return false;
  for (uint64 i = 0; i < events_.size(); i++) {
    if (events_[i] != other_events[i]) return false;
  }
  return true;
}

bool operator==(const StepEvents& a, const StepEvents& b) {
  if (a.size() != b.size()) return false;
  for (const auto& id_details : a) {
    const auto a_id = id_details.first;
    const auto& a_details = id_details.second;
    const auto* b_details = gtl::FindOrNull(b, a_id);
    if (b_details == nullptr) return false;
    if (a_details != *b_details) return false;
  }
  return true;
}

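// Sums the durations of 16-bit and 32-bit device compute events across all
// steps. Expects non-overlapped step events so that durations are not
// double-counted.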
PrecisionStats ComputePrecisionStats(
    const StepEvents& nonoverlapped_step_events) {
  int64 compute_32bit_ps = 0;
  int64 compute_16bit_ps = 0;
  for (const auto& id_details : nonoverlapped_step_events) {
    for (const auto& event : id_details.second.Events()) {
      switch (event.type) {
        case DEVICE_COMPUTE_32:
          compute_32bit_ps += event.span.duration_ps();
          break;
        case DEVICE_COMPUTE_16:
          compute_16bit_ps += event.span.duration_ps();
          break;
        default:
          break;
      }
    }
  }
  PrecisionStats precision_stats;
  precision_stats.set_compute_32bit_ps(compute_32bit_ps);
  precision_stats.set_compute_16bit_ps(compute_16bit_ps);
  return precision_stats;
}

}  // namespace profiler
}  // namespace tensorflow