/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"

#include <sstream>
#include <utility>
#include <vector>

#include "google/protobuf/any.pb.h"
#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/timespan.h"

namespace tensorflow {
namespace profiler {
// Local core IDs should start from 1; all events on this host are attributed
// to this default core (see ConvertStepEventsToStepDb below).
const uint32 kDefaultGpuLocalCoreId = 1;

namespace {

// Converts from StepDetails to StepInfoResult.
StepInfoResult ConvertStepDetailsToStepInfo(bool has_device, int64 step_num,
                                            const StepDetails& step_details) {
  GenericStepBreakdown generic;
  Timespan step_time = step_details.StepTime();
  auto& type_ps = *(generic.mutable_type_ps());
  uint64 total_event_duration = 0;
  for (const auto& event : step_details.Events()) {
    // Only count the portion of the event's duration that falls within the
    // step markers.
    uint64 event_duration = step_time.OverlappedDurationPs(event.span);
    type_ps[event.type] += event_duration;
    total_event_duration += event_duration;
  }
  if (total_event_duration < step_time.duration_ps()) {
    // Some time in the step is not associated with any event. Classify it as
    // "unknown time".
    type_ps[UNKNOWN_TIME] += step_time.duration_ps() - total_event_duration;
  }
  // Determines whether this step is well-formed: a step with a device must
  // contain device compute, while a host-only step must contain host compute.
  bool well_formed_step = has_device ? (type_ps.contains(DEVICE_COMPUTE_16) ||
                                        type_ps.contains(DEVICE_COMPUTE_32))
                                     : type_ps.contains(HOST_COMPUTE);
  StepInfoResult step_info;
  step_info.mutable_step_breakdown()->PackFrom(generic);
  if (well_formed_step) {
    step_info.set_step_num(step_num);
    step_info.set_begin_ps(step_time.begin_ps());
    step_info.set_duration_ps(step_time.duration_ps());
  } else {
    // For a non-well-formed step, sets its duration to 0 so that it will be
    // ignored by the caller of this function.
    step_info.set_duration_ps(0);
  }
  return step_info;
}

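// Returns a human-readable breakdown of a GenericStepBreakdown: the total
// time across all event types followed by each event type's time in ps and
// its percentage of the total.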
string DebugGenericStepBreakdown(const GenericStepBreakdown& generic) {
  std::ostringstream out;
  uint64 total_ps = 0;
  const auto& type_ps_map = generic.type_ps();
  for (const auto& type_ps : type_ps_map) {
    total_ps += type_ps.second;
  }
  out << "Total ps = " << total_ps << std::endl;
  // Prints event types in decreasing enum-value order.
  for (int type = LAST_EVENT_TYPE; type >= 0; --type) {
    const auto* ps = gtl::FindOrNull(type_ps_map, type);
    if (ps == nullptr) continue;
    double percent = (*ps * 100.0) / total_ps;
    auto event_type = static_cast<EventType>(type);
    out << PrintEventType(event_type) << ": " << percent << "%"
        << ", ps = " << *ps << std::endl;
  }
  return out.str();
}

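// Returns a human-readable summary of a StepInfoResult, including its
// GenericStepBreakdown if one can be unpacked from the step_breakdown field.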
string DebugStepInfo(const StepInfoResult& step_info) {
  std::ostringstream out;
  out << "step_num=" << step_info.step_num()
      << ", duration_ps=" << step_info.duration_ps()
      << ", begin_ps=" << step_info.begin_ps() << std::endl;
  GenericStepBreakdown generic;
  if (step_info.step_breakdown().UnpackTo(&generic)) {
    out << "Generic step breakdown:" << std::endl;
    out << DebugGenericStepBreakdown(generic) << std::endl;
  } else {
    out << step_info.step_breakdown().DebugString() << std::endl;
  }
  return out.str();
}

}  // namespace

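// Converts non-overlapped step events into a step database, with steps
// emitted in increasing step-number order. Steps that are not well-formed
// (duration 0) are skipped.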
StepDatabaseResult ConvertStepEventsToStepDb(
    bool has_device, bool maybe_drop_incomplete_steps,
    const StepEvents& nonoverlapped_step_events) {
  StepDatabaseResult step_db;
  // Gets sorted step numbers.
  std::vector<int64> step_numbers;
  step_numbers.reserve(nonoverlapped_step_events.size());
  for (const auto& step_events : nonoverlapped_step_events) {
    step_numbers.push_back(step_events.first);
  }
  absl::c_sort(step_numbers);
  for (const auto& step : step_numbers) {
    const auto* step_details = gtl::FindOrNull(nonoverlapped_step_events, step);
    if (step_details == nullptr) continue;
    StepInfoResult step_info =
        ConvertStepDetailsToStepInfo(has_device, step, *step_details);
    if (step_info.duration_ps() == 0)
      continue;  // Do not include non-well-formed steps.
    PerCoreStepInfo per_core_step_info;
    per_core_step_info.set_step_num(step);
    // When we generated StepEvents, we already merged events from all device
    // cores and CPU threads on this host into a single event stream, so we
    // can no longer separate them. Simply assign all events to the default
    // core.
    (*per_core_step_info.mutable_step_info_per_core())[kDefaultGpuLocalCoreId] =
        std::move(step_info);
    VLOG(2) << std::endl
            << "step_id: " << step << ", step_info:" << std::endl
            << DebugStepInfo((
                   *per_core_step_info
                        .mutable_step_info_per_core())[kDefaultGpuLocalCoreId]);
    // Populates the collective ops information.
    auto& collectives = *per_core_step_info.mutable_all_reduce_db_per_core();
    for (const auto& it : step_details->Collectives()) {
      collectives[it.first] = it.second;
    }
    // Populates the device transfer stats for this step.
    auto& device_memory_transfers =
        *per_core_step_info.mutable_device_memory_transfers();
    for (const auto& dma : step_details->DeviceMemoryTransfers()) {
      *device_memory_transfers.Add() = dma;
    }
    // The remaining fields in PerCoreStepInfo are not filled.
    *step_db.add_step_sequence() = per_core_step_info;
  }

  // If we are in sampling mode and have collected enough steps, drop the
  // incomplete steps at the beginning and the end. (Sometimes CUPTI
  // instrumentation prolongs the first step, too.)
  constexpr int kDropIncompleteStepThreshold = 5;
  if (maybe_drop_incomplete_steps &&
      step_db.step_sequence_size() > kDropIncompleteStepThreshold) {
    step_db.mutable_step_sequence()->erase(
        step_db.mutable_step_sequence()->begin());
    step_db.mutable_step_sequence()->RemoveLast();
  }
  return step_db;
}

}  // namespace profiler
}  // namespace tensorflow