1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"
17 
18 #include <vector>
19 
20 #include "absl/container/flat_hash_map.h"
21 #include "absl/container/flat_hash_set.h"
22 #include "tensorflow/core/platform/env.h"
23 #include "tensorflow/core/platform/types.h"
24 #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
25 #include "tensorflow/core/profiler/convert/op_stats_combiner.h"
26 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
27 #include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
28 #include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
29 #include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
30 #include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
31 #include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
32 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
33 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
34 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
35 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
36 #include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
37 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
38 #include "tensorflow/core/profiler/utils/event_span.h"
39 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
40 #include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
41 #include "tensorflow/core/profiler/utils/step_intersection.h"
42 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
43 #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
44 #include "tensorflow/core/profiler/utils/xplane_schema.h"
45 #include "tensorflow/core/profiler/utils/xplane_utils.h"
46 #include "tensorflow/core/profiler/utils/xplane_visitor.h"
47 
48 namespace tensorflow {
49 namespace profiler {
50 
GetDeviceCapFromXPlane(const XPlane & device_plane)51 DeviceCapabilities GetDeviceCapFromXPlane(const XPlane& device_plane) {
52   DeviceCapabilities cap;
53   XPlaneVisitor plane = CreateTfXPlaneVisitor(&device_plane);
54   plane.ForEachStat([&cap](const XStatVisitor& stat) {
55     if (!stat.Type().has_value()) return;
56     switch (stat.Type().value()) {
57       case kDevCapClockRateKHz:
58         cap.set_clock_rate_in_ghz(stat.IntValue() / 1000000.0);
59         break;
60       case kDevCapCoreCount:
61         cap.set_num_cores(stat.IntValue());
62         break;
63       case kDevCapMemoryBandwidth:
64         cap.set_memory_bandwidth(stat.UintValue());  // bytes/s
65         break;
66       case kDevCapMemorySize:
67         cap.set_memory_size_in_bytes(stat.UintValue());
68         break;
69       case kDevCapComputeCapMajor:
70         cap.mutable_compute_capability()->set_major(stat.IntValue());
71         break;
72       case kDevCapComputeCapMinor:
73         cap.mutable_compute_capability()->set_minor(stat.IntValue());
74         break;
75     }
76   });
77   return cap;
78 }
79 
MakePerfEnv(double peak_tera_flops_per_second,double peak_hbm_bw_giga_bytes_per_second)80 PerfEnv MakePerfEnv(double peak_tera_flops_per_second,
81                     double peak_hbm_bw_giga_bytes_per_second) {
82   PerfEnv result;
83   result.set_peak_tera_flops_per_second(peak_tera_flops_per_second);
84   result.set_peak_hbm_bw_giga_bytes_per_second(
85       peak_hbm_bw_giga_bytes_per_second);
86   result.set_ridge_point(peak_tera_flops_per_second * 1000 /
87                          peak_hbm_bw_giga_bytes_per_second);
88   return result;
89 }
90 
GetPerfEnvFromXPlane(const XPlane & device_plane)91 PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) {
92   DeviceCapabilities cap = GetDeviceCapFromXPlane(device_plane);
93   return MakePerfEnv(GetFlopMaxThroughputPerSM(cap) / 1000 * cap.num_cores(),
94                      cap.memory_bandwidth() / 1e9);
95 }
96 
97 namespace {
98 
SetRunEnvironment(const XSpace & space,int32 accelerator_count,RunEnvironment * env)99 void SetRunEnvironment(const XSpace& space, int32 accelerator_count,
100                        RunEnvironment* env) {
101   // Currently, we only support profiling one host and one program.
102   env->set_host_count(1);
103   env->set_task_count(1);
104   for (const auto& hostname : space.hostnames()) {
105     std::vector<std::string> hostname_split = absl::StrSplit(hostname, ':');
106     (*env->mutable_hostnames())[hostname_split[0]] = true;
107   }
108   env->set_device_type(accelerator_count > 0 ? "GPU" : "CPU");
109   env->set_device_core_count(accelerator_count);
110 }
111 
ProcessHostPlane(const XPlane * host_plane,bool use_device_step_events,const OpStatsOptions & options,OpMetricsDb * op_metrics_db,StepEvents * step_events)112 void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
113                       const OpStatsOptions& options, OpMetricsDb* op_metrics_db,
114                       StepEvents* step_events) {
115   absl::flat_hash_map<int64, TfOp> tf_ops =
116       CollectTfOpsFromHostThreadsXPlane(*host_plane);
117   OpMetricsDbCombiner combiner(op_metrics_db);
118   XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane);
119   plane.ForEachLine([&](const XLineVisitor& line) {
120     ConsumeTfMetricsDbData(
121         ConvertHostThreadsXLineToTfMetricsDbData(line, tf_ops), &combiner);
122     if (options.generate_step_db) {
123       CombineStepEvents(ConvertHostThreadsXLineToStepEvents(
124                             line, use_device_step_events, *step_events),
125                         step_events);
126     }
127   });
128 }
129 
130 }  // namespace
131 
PropagateXSpaceDiagnosticsToOpStats(const XSpace & space,OpStats * op_stats)132 void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
133                                          OpStats* op_stats) {
134   if (!space.errors().empty()) {
135     absl::flat_hash_set<std::string> unique_errors;
136     unique_errors.insert(space.errors().begin(), space.errors().end());
137     *op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
138                                                           unique_errors.end()};
139   }
140   if (!space.warnings().empty()) {
141     absl::flat_hash_set<std::string> unique_warnings;
142     unique_warnings.insert(space.warnings().begin(), space.warnings().end());
143     *op_stats->mutable_diagnostics()->mutable_warnings() = {
144         unique_warnings.begin(), unique_warnings.end()};
145   }
146 }
147 
// Converts a single profiled XSpace into an OpStats proto.
//
// Processing order matters here: device planes are processed before the host
// plane so that device step events exist when host step events are derived,
// and step events are flattened (de-overlapped) only once, after all planes
// have contributed.
OpStats ConvertXSpaceToOpStats(const XSpace& space,
                               const OpStatsOptions& options) {
  const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName);
  std::vector<const XPlane*> device_planes =
      FindPlanesWithPrefix(space, kGpuPlanePrefix);
  OpStats op_stats;
  StepEvents step_events;
  PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
  // Convert device planes.
  OpMetricsDbCombiner op_metrics_db_combiner(
      op_stats.mutable_device_op_metrics_db());
  // One GPU plane per accelerator, so the plane count is the device count.
  SetRunEnvironment(space, device_planes.size(),
                    op_stats.mutable_run_environment());

  KernelReportMap reports;
  absl::string_view gpu_model = "";

  // TODO(b/161942993) parallelize XPlane processing per thread.
  for (const XPlane* device_trace : device_planes) {
    if (options.generate_op_metrics_db) {
      // The perf env (roofline parameters) is taken from the first device
      // plane only; assumes homogeneous devices — TODO confirm.
      if (!op_stats.has_perf_env()) {
        *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
      }
      OpMetricsDb device_op_metrics_db =
          ConvertDeviceTraceXPlaneToOpMetricsDb(*device_trace);
      op_metrics_db_combiner.Combine(device_op_metrics_db);
    }
    // Likewise, the GPU model name comes from the first device plane that
    // yields a non-empty name.
    if (gpu_model.empty()) {
      gpu_model = GpuModelName(GetDeviceCapFromXPlane(*device_trace));
    }
    if (options.generate_step_db) {
      CombineStepEvents(ConvertDeviceTraceXPlaneToStepEvents(*device_trace),
                        &step_events);
    }
    if (options.generate_kernel_stats_db) {
      ConvertDeviceTraceXPlaneToKernelReports(*device_trace,
                                              /*on_kernel_fn=*/{}, &reports);
    }
  }

  if (!gpu_model.empty()) {
    // Overwrites the device type with the more specific GPU model name.
    op_stats.mutable_run_environment()->set_device_type(std::string(gpu_model));
  }

  // Combine into reports.
  if (options.generate_kernel_stats_db) {
    CopyTopKDurationKernelReportsToDb(reports,
                                      op_stats.mutable_kernel_stats_db());
  }

  bool has_device = !device_planes.empty();
  // Convert a host plane. Must run after the device loop: host step events
  // are filtered against the device step events accumulated above.
  if (host_plane && options.generate_op_metrics_db) {
    ProcessHostPlane(host_plane, has_device, options,
                     op_stats.mutable_host_op_metrics_db(), &step_events);
  }
  if (options.generate_step_db) {
    // Remove overlap between events before deriving per-step databases and
    // precision statistics.
    StepEvents nonoverlapped_step_events =
        ToNonOverlappedStepEvents(step_events);
    *op_stats.mutable_step_db() = ConvertStepEventsToStepDb(
        has_device, options.maybe_drop_incomplete_steps,
        nonoverlapped_step_events);
    *op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() =
        ComputePrecisionStats(nonoverlapped_step_events);
  }

  // Record which host produced this profile under the default GPU core id.
  CoreDetails& details =
      (*op_stats.mutable_core_id_to_details())[kDefaultGpuLocalCoreId];
  details.set_hostname(space.hostnames().empty() ? "localhost"
                                                 : space.hostnames(0));
  return op_stats;
}
221 
ConvertMultiXSpacesToCombinedOpStats(const std::vector<std::string> & xspace_paths,const OpStatsOptions & options,OpStats * combined_op_stats)222 Status ConvertMultiXSpacesToCombinedOpStats(
223     const std::vector<std::string>& xspace_paths, const OpStatsOptions& options,
224     OpStats* combined_op_stats) {
225   // A shortcut code path for a single XSpace. There is no need to merge OpStats
226   // if there is only a single XSpace.
227   if (xspace_paths.size() == 1) {
228     XSpace xspace;
229     Status status = ReadBinaryProto(Env::Default(), xspace_paths[0], &xspace);
230     if (!status.ok()) return status;
231     *combined_op_stats = ConvertXSpaceToOpStats(xspace, options);
232     return Status::OK();
233   }
234 
235   // Read multiple XSpaces and convert to multiple OpStats.
236   std::vector<OpStats> all_op_stats;
237   for (const std::string& xspace_path : xspace_paths) {
238     XSpace xspace;
239     Status status = ReadBinaryProto(Env::Default(), xspace_path, &xspace);
240     if (!status.ok()) return status;
241     all_op_stats.push_back(ConvertXSpaceToOpStats(xspace, options));
242   }
243 
244   // Combine OpStats.
245   std::vector<OpStatsInfo> all_op_stats_info;
246   all_op_stats_info.reserve(all_op_stats.size());
247   for (int i = 0; i < all_op_stats.size(); i++) {
248     all_op_stats_info.emplace_back(
249         &all_op_stats[i],
250         ParseHardwareType(all_op_stats[i].run_environment().device_type()), i);
251   }
252 
253   // Do not limit the maximum number of steps during the merge of OpStats.
254   StepIntersection step_intersection =
255       ComputeStepIntersectionToMergeOpStats(all_op_stats_info, kuint32max);
256   CombineAllOpStats(all_op_stats_info, step_intersection, combined_op_stats);
257 
258   return Status::OK();
259 }
260 
261 }  // namespace profiler
262 }  // namespace tensorflow
263