1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"

#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
#include "tensorflow/core/profiler/convert/op_stats_combiner.h"
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
#include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
#include "tensorflow/core/profiler/utils/step_intersection.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"
47
48 namespace tensorflow {
49 namespace profiler {
50
GetDeviceCapFromXPlane(const XPlane & device_plane)51 DeviceCapabilities GetDeviceCapFromXPlane(const XPlane& device_plane) {
52 DeviceCapabilities cap;
53 XPlaneVisitor plane = CreateTfXPlaneVisitor(&device_plane);
54 plane.ForEachStat([&cap](const XStatVisitor& stat) {
55 if (!stat.Type().has_value()) return;
56 switch (stat.Type().value()) {
57 case kDevCapClockRateKHz:
58 cap.set_clock_rate_in_ghz(stat.IntValue() / 1000000.0);
59 break;
60 case kDevCapCoreCount:
61 cap.set_num_cores(stat.IntValue());
62 break;
63 case kDevCapMemoryBandwidth:
64 cap.set_memory_bandwidth(stat.UintValue()); // bytes/s
65 break;
66 case kDevCapMemorySize:
67 cap.set_memory_size_in_bytes(stat.UintValue());
68 break;
69 case kDevCapComputeCapMajor:
70 cap.mutable_compute_capability()->set_major(stat.IntValue());
71 break;
72 case kDevCapComputeCapMinor:
73 cap.mutable_compute_capability()->set_minor(stat.IntValue());
74 break;
75 }
76 });
77 return cap;
78 }
79
MakePerfEnv(double peak_tera_flops_per_second,double peak_hbm_bw_giga_bytes_per_second)80 PerfEnv MakePerfEnv(double peak_tera_flops_per_second,
81 double peak_hbm_bw_giga_bytes_per_second) {
82 PerfEnv result;
83 result.set_peak_tera_flops_per_second(peak_tera_flops_per_second);
84 result.set_peak_hbm_bw_giga_bytes_per_second(
85 peak_hbm_bw_giga_bytes_per_second);
86 result.set_ridge_point(peak_tera_flops_per_second * 1000 /
87 peak_hbm_bw_giga_bytes_per_second);
88 return result;
89 }
90
GetPerfEnvFromXPlane(const XPlane & device_plane)91 PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) {
92 DeviceCapabilities cap = GetDeviceCapFromXPlane(device_plane);
93 return MakePerfEnv(GetFlopMaxThroughputPerSM(cap) / 1000 * cap.num_cores(),
94 cap.memory_bandwidth() / 1e9);
95 }
96
97 namespace {
98
SetRunEnvironment(const XSpace & space,int32 accelerator_count,RunEnvironment * env)99 void SetRunEnvironment(const XSpace& space, int32 accelerator_count,
100 RunEnvironment* env) {
101 // Currently, we only support profiling one host and one program.
102 env->set_host_count(1);
103 env->set_task_count(1);
104 for (const auto& hostname : space.hostnames()) {
105 std::vector<std::string> hostname_split = absl::StrSplit(hostname, ':');
106 (*env->mutable_hostnames())[hostname_split[0]] = true;
107 }
108 env->set_device_type(accelerator_count > 0 ? "GPU" : "CPU");
109 env->set_device_core_count(accelerator_count);
110 }
111
ProcessHostPlane(const XPlane * host_plane,bool use_device_step_events,const OpStatsOptions & options,OpMetricsDb * op_metrics_db,StepEvents * step_events)112 void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
113 const OpStatsOptions& options, OpMetricsDb* op_metrics_db,
114 StepEvents* step_events) {
115 absl::flat_hash_map<int64, TfOp> tf_ops =
116 CollectTfOpsFromHostThreadsXPlane(*host_plane);
117 OpMetricsDbCombiner combiner(op_metrics_db);
118 XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane);
119 plane.ForEachLine([&](const XLineVisitor& line) {
120 ConsumeTfMetricsDbData(
121 ConvertHostThreadsXLineToTfMetricsDbData(line, tf_ops), &combiner);
122 if (options.generate_step_db) {
123 CombineStepEvents(ConvertHostThreadsXLineToStepEvents(
124 line, use_device_step_events, *step_events),
125 step_events);
126 }
127 });
128 }
129
130 } // namespace
131
PropagateXSpaceDiagnosticsToOpStats(const XSpace & space,OpStats * op_stats)132 void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
133 OpStats* op_stats) {
134 if (!space.errors().empty()) {
135 absl::flat_hash_set<std::string> unique_errors;
136 unique_errors.insert(space.errors().begin(), space.errors().end());
137 *op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
138 unique_errors.end()};
139 }
140 if (!space.warnings().empty()) {
141 absl::flat_hash_set<std::string> unique_warnings;
142 unique_warnings.insert(space.warnings().begin(), space.warnings().end());
143 *op_stats->mutable_diagnostics()->mutable_warnings() = {
144 unique_warnings.begin(), unique_warnings.end()};
145 }
146 }
147
// Converts one XSpace (a single host's profile) into an OpStats proto.
// Processing order matters: device (GPU) planes are consumed first so that
// the host plane can see whether device step events exist, and step events
// from all planes are accumulated before being normalized into the step DB.
OpStats ConvertXSpaceToOpStats(const XSpace& space,
                               const OpStatsOptions& options) {
  const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName);
  std::vector<const XPlane*> device_planes =
      FindPlanesWithPrefix(space, kGpuPlanePrefix);
  OpStats op_stats;
  StepEvents step_events;
  PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
  // Convert device planes.
  OpMetricsDbCombiner op_metrics_db_combiner(
      op_stats.mutable_device_op_metrics_db());
  SetRunEnvironment(space, device_planes.size(),
                    op_stats.mutable_run_environment());

  KernelReportMap reports;
  absl::string_view gpu_model = "";

  // TODO(b/161942993) parallelize XPlane processing per thread.
  for (const XPlane* device_trace : device_planes) {
    if (options.generate_op_metrics_db) {
      // The perf env is taken from the first device plane only; presumably
      // all local GPUs are identical — TODO confirm for heterogeneous hosts.
      if (!op_stats.has_perf_env()) {
        *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
      }
      OpMetricsDb device_op_metrics_db =
          ConvertDeviceTraceXPlaneToOpMetricsDb(*device_trace);
      op_metrics_db_combiner.Combine(device_op_metrics_db);
    }
    // Likewise, keep the first non-empty GPU model name encountered.
    if (gpu_model.empty()) {
      gpu_model = GpuModelName(GetDeviceCapFromXPlane(*device_trace));
    }
    if (options.generate_step_db) {
      CombineStepEvents(ConvertDeviceTraceXPlaneToStepEvents(*device_trace),
                        &step_events);
    }
    if (options.generate_kernel_stats_db) {
      // Accumulate kernel reports from every device plane into one map.
      ConvertDeviceTraceXPlaneToKernelReports(*device_trace,
                                              /*on_kernel_fn=*/{}, &reports);
    }
  }

  if (!gpu_model.empty()) {
    // Overwrites the device type with the more specific GPU model name.
    op_stats.mutable_run_environment()->set_device_type(std::string(gpu_model));
  }

  // Combine into reports.
  if (options.generate_kernel_stats_db) {
    CopyTopKDurationKernelReportsToDb(reports,
                                      op_stats.mutable_kernel_stats_db());
  }

  bool has_device = !device_planes.empty();
  // Convert a host plane.
  if (host_plane && options.generate_op_metrics_db) {
    // May append host step events to `step_events`, so it must run before
    // the step-DB conversion below.
    ProcessHostPlane(host_plane, has_device, options,
                     op_stats.mutable_host_op_metrics_db(), &step_events);
  }
  if (options.generate_step_db) {
    StepEvents nonoverlapped_step_events =
        ToNonOverlappedStepEvents(step_events);
    *op_stats.mutable_step_db() = ConvertStepEventsToStepDb(
        has_device, options.maybe_drop_incomplete_steps,
        nonoverlapped_step_events);
    *op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() =
        ComputePrecisionStats(nonoverlapped_step_events);
  }

  // Record the (single) core's hostname; fall back to "localhost" when the
  // XSpace carries no hostnames.
  CoreDetails& details =
      (*op_stats.mutable_core_id_to_details())[kDefaultGpuLocalCoreId];
  details.set_hostname(space.hostnames().empty() ? "localhost"
                                                 : space.hostnames(0));
  return op_stats;
}
221
ConvertMultiXSpacesToCombinedOpStats(const std::vector<std::string> & xspace_paths,const OpStatsOptions & options,OpStats * combined_op_stats)222 Status ConvertMultiXSpacesToCombinedOpStats(
223 const std::vector<std::string>& xspace_paths, const OpStatsOptions& options,
224 OpStats* combined_op_stats) {
225 // A shortcut code path for a single XSpace. There is no need to merge OpStats
226 // if there is only a single XSpace.
227 if (xspace_paths.size() == 1) {
228 XSpace xspace;
229 Status status = ReadBinaryProto(Env::Default(), xspace_paths[0], &xspace);
230 if (!status.ok()) return status;
231 *combined_op_stats = ConvertXSpaceToOpStats(xspace, options);
232 return Status::OK();
233 }
234
235 // Read multiple XSpaces and convert to multiple OpStats.
236 std::vector<OpStats> all_op_stats;
237 for (const std::string& xspace_path : xspace_paths) {
238 XSpace xspace;
239 Status status = ReadBinaryProto(Env::Default(), xspace_path, &xspace);
240 if (!status.ok()) return status;
241 all_op_stats.push_back(ConvertXSpaceToOpStats(xspace, options));
242 }
243
244 // Combine OpStats.
245 std::vector<OpStatsInfo> all_op_stats_info;
246 all_op_stats_info.reserve(all_op_stats.size());
247 for (int i = 0; i < all_op_stats.size(); i++) {
248 all_op_stats_info.emplace_back(
249 &all_op_stats[i],
250 ParseHardwareType(all_op_stats[i].run_environment().device_type()), i);
251 }
252
253 // Do not limit the maximum number of steps during the merge of OpStats.
254 StepIntersection step_intersection =
255 ComputeStepIntersectionToMergeOpStats(all_op_stats_info, kuint32max);
256 CombineAllOpStats(all_op_stats_info, step_intersection, combined_op_stats);
257
258 return Status::OK();
259 }
260
261 } // namespace profiler
262 } // namespace tensorflow
263