1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
17
18 #include <algorithm>
19 #include <string>
20 #include <tuple>
21 #include <vector>
22
23 #include "absl/algorithm/container.h"
24 #include "absl/strings/match.h"
25 #include "absl/strings/numbers.h"
26 #include "absl/strings/str_split.h"
27 #include "absl/strings/string_view.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/types.h"
30 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
31
32 namespace tensorflow {
33 namespace profiler {
34
namespace {

// The maximum number of Kernels displayed on Kernel Stats page.
// constexpr: this is a compile-time constant (used as a sort/copy bound).
constexpr int kMaxNumOfKernels = 1000;

}  // namespace
41
ParseKernelLaunchParams(absl::string_view xstat_kernel_details,KernelReport * kernel)42 void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
43 KernelReport* kernel) {
44 const std::vector<absl::string_view> params =
45 absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(" \n"));
46
47 constexpr uint32 kNumDimensions = 3;
48 for (uint32 dim = 0; dim < kNumDimensions; ++dim) {
49 kernel->add_block_dim(1);
50 kernel->add_grid_dim(1);
51 }
52
53 // Process tokens.
54 for (const auto& param : params) {
55 const std::vector<absl::string_view> key_value = absl::StrSplit(param, ':');
56 if (key_value.size() != 2) {
57 // Unrecognized token.
58 continue;
59 }
60 absl::string_view key = key_value[0];
61 absl::string_view value_str = key_value[1];
62 uint32 value = 0;
63 double pct = 0.0;
64 // Cases that consume a pair of tokens "key:value".
65 if (key == "regs" && absl::SimpleAtoi(value_str, &value)) {
66 kernel->set_registers_per_thread(value);
67 } else if (key == "static_shared" && absl::SimpleAtoi(value_str, &value)) {
68 kernel->set_static_shmem_bytes(value);
69 } else if (key == "dynamic_shared" && absl::SimpleAtoi(value_str, &value)) {
70 kernel->set_dynamic_shmem_bytes(value);
71 } else if (key == "block") {
72 const std::vector<absl::string_view>& block =
73 absl::StrSplit(value_str, ',');
74 uint32 tmp[3];
75 if (block.size() == 3 && absl::SimpleAtoi(block[0], &tmp[0]) &&
76 absl::SimpleAtoi(block[1], &tmp[1]) &&
77 absl::SimpleAtoi(block[2], &tmp[2])) {
78 std::copy_n(tmp, 3, kernel->mutable_block_dim()->begin());
79 }
80 } else if (key == "grid") {
81 const std::vector<absl::string_view>& grid =
82 absl::StrSplit(value_str, ',');
83 uint32 tmp[3];
84 if (grid.size() == 3 && absl::SimpleAtoi(grid[0], &tmp[0]) &&
85 absl::SimpleAtoi(grid[1], &tmp[1]) &&
86 absl::SimpleAtoi(grid[2], &tmp[2])) {
87 std::copy_n(tmp, 3, kernel->mutable_grid_dim()->begin());
88 }
89 } else if (key == "occ_pct" && absl::SimpleAtod(value_str, &pct)) {
90 kernel->set_occupancy_pct(pct);
91 }
92 }
93 }
94
IsKernelUsingTensorCore(absl::string_view kernel_name)95 bool IsKernelUsingTensorCore(absl::string_view kernel_name) {
96 // Some examples: volta_h884gemm, volta_fp16_s884gemm,
97 // turing_fp16_s1688cudnn_fp16
98 bool possible_tensor_kernel = absl::StrContains(kernel_name, "884") ||
99 absl::StrContains(kernel_name, "1688") ||
100 absl::StrContains(kernel_name, "hmma") ||
101 absl::StrContains(kernel_name, "xmma");
102 if (possible_tensor_kernel) {
103 VLOG(3) << "Possible tensor kernel: " << kernel_name;
104 }
105
106 return (absl::StartsWith(kernel_name, "volta_i884") ||
107 absl::StartsWith(kernel_name, "volta_h884") ||
108 absl::StartsWith(kernel_name, "volta_s884") ||
109 absl::StartsWith(kernel_name, "volta_fp16_i884") ||
110 absl::StartsWith(kernel_name, "volta_fp16_h884") ||
111 absl::StartsWith(kernel_name, "volta_fp16_s884") ||
112 absl::StartsWith(kernel_name, "turing_i1688") ||
113 absl::StartsWith(kernel_name, "turing_h1688") ||
114 absl::StartsWith(kernel_name, "turing_s1688") ||
115 absl::StartsWith(kernel_name, "turing_fp16_i1688") ||
116 absl::StartsWith(kernel_name, "turing_fp16_h1688") ||
117 absl::StartsWith(kernel_name, "turing_fp16_s1688") ||
118 absl::StrContains(kernel_name, "hmma") ||
119 absl::StrContains(kernel_name, "xmma"));
120 }
121
122 // This list is not exhaustive.
IsOpTensorCoreEligible(absl::string_view tf_op_name)123 bool IsOpTensorCoreEligible(absl::string_view tf_op_name) {
124 // Disable formatting to keep inline comments vertically aligned.
125 // clang-format off
126 return false
127 // Using EndsWith to match Fused operations.
128 || absl::EndsWith(tf_op_name, "Conv2D")
129 || absl::EndsWith(tf_op_name, "Conv2DBackpropFilter")
130 || absl::EndsWith(tf_op_name, "Conv2DBackpropInput")
131 || absl::EndsWith(tf_op_name, "Conv3D")
132 || absl::EndsWith(tf_op_name, "DepthwiseConv2dNative")
133 || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropFilter")
134 || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropInput")
135 // Using Contains to match V2/V3 suffixes.
136 || absl::StrContains(tf_op_name, "BatchMatMul")
137 // MatMul requires exact matching.
138 || absl::EndsWith(tf_op_name, "/MatMul")
139 || absl::EndsWith(tf_op_name, "FusedMatMul")
140 // cuDNN operations.
141 || absl::EndsWith(tf_op_name, "/CudnnRNN")
142 || absl::StrContains(tf_op_name, "CudnnRNNV")
143 || absl::StrContains(tf_op_name, "CudnnRNNForward")
144 || absl::StrContains(tf_op_name, "CudnnRNNBackprop")
145 // Special cases.
146 || absl::EndsWith(tf_op_name, "XlaDot");
147 // clang-format on
148 }
149
IsEinsumTensorCoreEligible(absl::string_view equation)150 bool IsEinsumTensorCoreEligible(absl::string_view equation) {
151 if (equation.empty()) {
152 return false;
153 }
154 const std::vector<absl::string_view> input_output =
155 absl::StrSplit(equation, "->");
156 if (input_output.size() != 2) {
157 return false;
158 }
159 const std::vector<absl::string_view> lhs_rhs =
160 absl::StrSplit(input_output[0], ',');
161 return lhs_rhs.size() == 2;
162 }
163
operator ()(const KernelReport & lhs,const KernelReport & rhs) const164 bool KernelReportLessThanComparator::operator()(const KernelReport& lhs,
165 const KernelReport& rhs) const {
166 // Disable formatting to keep vertical alignment for better readability,
167 // and make it easier to reorder columns.
168 // clang-format off
169 auto lhs_tuple = std::make_tuple(
170 lhs.name(),
171 lhs.grid_dim(0),
172 lhs.grid_dim(1),
173 lhs.grid_dim(2),
174 lhs.block_dim(0),
175 lhs.block_dim(1),
176 lhs.block_dim(2),
177 lhs.registers_per_thread(),
178 lhs.static_shmem_bytes(),
179 lhs.dynamic_shmem_bytes(),
180 lhs.is_kernel_using_tensor_core(),
181 lhs.is_op_tensor_core_eligible(),
182 lhs.op_name());
183
184 auto rhs_tuple = std::make_tuple(
185 rhs.name(),
186 rhs.grid_dim(0),
187 rhs.grid_dim(1),
188 rhs.grid_dim(2),
189 rhs.block_dim(0),
190 rhs.block_dim(1),
191 rhs.block_dim(2),
192 rhs.registers_per_thread(),
193 rhs.static_shmem_bytes(),
194 rhs.dynamic_shmem_bytes(),
195 rhs.is_kernel_using_tensor_core(),
196 rhs.is_op_tensor_core_eligible(),
197 rhs.op_name());
198 // clang-format on
199 return lhs_tuple < rhs_tuple;
200 }
201
operator ()(const KernelReport & lhs,const KernelReport & rhs) const202 bool KernelReportEqualToComparator::operator()(const KernelReport& lhs,
203 const KernelReport& rhs) const {
204 // Disable formatting to keep vertical alignment for better readability,
205 // and make it easier to reorder columns.
206 // clang-format off
207 // Put the most expensive string comparisons last.
208 return (
209 lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() &&
210 lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() &&
211 lhs.block_dim(0) == rhs.block_dim(0) &&
212 lhs.block_dim(1) == rhs.block_dim(1) &&
213 lhs.block_dim(2) == rhs.block_dim(2) &&
214 lhs.grid_dim(0) == rhs.grid_dim(0) &&
215 lhs.grid_dim(1) == rhs.grid_dim(1) &&
216 lhs.grid_dim(2) == rhs.grid_dim(2) &&
217 lhs.registers_per_thread() == rhs.registers_per_thread() &&
218 lhs.static_shmem_bytes() == rhs.static_shmem_bytes() &&
219 lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() &&
220 lhs.name() == rhs.name() &&
221 lhs.op_name() == rhs.op_name());
222 // clang-format on
223 }
224
SortAndKeepTopKDurationKernelReportsInDb(KernelStatsDb * kernel_stats_db)225 void SortAndKeepTopKDurationKernelReportsInDb(KernelStatsDb* kernel_stats_db) {
226 auto comp = [](const KernelReport& lhs, const KernelReport& rhs) {
227 return lhs.total_duration_ns() > rhs.total_duration_ns() ||
228 (lhs.total_duration_ns() == rhs.total_duration_ns() &&
229 KernelReportLessThanComparator()(lhs, rhs));
230 };
231
232 // Sort and keep at most <kMaxNumOfKernels> kernel reports.
233 if (kernel_stats_db->reports_size() > kMaxNumOfKernels) {
234 std::partial_sort(
235 kernel_stats_db->mutable_reports()->begin(),
236 kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
237 kernel_stats_db->mutable_reports()->end(), comp);
238 kernel_stats_db->mutable_reports()->erase(
239 kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
240 kernel_stats_db->mutable_reports()->end());
241 } else {
242 std::sort(kernel_stats_db->mutable_reports()->begin(),
243 kernel_stats_db->mutable_reports()->end(), comp);
244 }
245 }
246
CopyTopKDurationKernelReportsToDb(const KernelReportMap & reports,KernelStatsDb * dst)247 void CopyTopKDurationKernelReportsToDb(const KernelReportMap& reports,
248 KernelStatsDb* dst) {
249 std::vector<std::pair<const KernelReport*, const KernelReportValue*>>
250 kernels_to_sort;
251 kernels_to_sort.reserve(reports.size());
252 for (const auto& report_value : reports) {
253 kernels_to_sort.push_back(
254 std::make_pair(&report_value.first, &report_value.second));
255 }
256
257 auto comp =
258 [](const std::pair<const KernelReport*, const KernelReportValue*>& lhs,
259 const std::pair<const KernelReport*, const KernelReportValue*>& rhs) {
260 return lhs.second->total_duration_ns > rhs.second->total_duration_ns ||
261 (lhs.second->total_duration_ns ==
262 rhs.second->total_duration_ns &&
263 KernelReportLessThanComparator()(*lhs.first, *rhs.first));
264 };
265
266 // Sort and copy at most <kMaxNumOfKernels> kernels to <dst>.
267 if (kernels_to_sort.size() > kMaxNumOfKernels) {
268 absl::c_partial_sort(kernels_to_sort,
269 kernels_to_sort.begin() + kMaxNumOfKernels, comp);
270 } else {
271 absl::c_sort(kernels_to_sort, comp);
272 }
273
274 int copy_size =
275 std::min(kMaxNumOfKernels, static_cast<int>(kernels_to_sort.size()));
276 for (int i = 0; i < copy_size; i++) {
277 KernelReport* report = dst->add_reports();
278 *report = *kernels_to_sort[i].first;
279 const KernelReportValue& kernel_value = *kernels_to_sort[i].second;
280 // Set value using KernelReportValue.
281 report->set_occurrences(kernel_value.occurrences);
282 report->set_min_duration_ns(kernel_value.min_duration_ns);
283 report->set_max_duration_ns(kernel_value.max_duration_ns);
284 report->set_total_duration_ns(kernel_value.total_duration_ns);
285 }
286 }
287
InsertOrUpdateKernelReport(const KernelReport & kernel,const KernelReportValue & value,KernelReportMap * dst)288 void InsertOrUpdateKernelReport(const KernelReport& kernel,
289 const KernelReportValue& value,
290 KernelReportMap* dst) {
291 KernelReportValue& element = (*dst)[kernel];
292 if (element.occurrences == 0) {
293 element = value;
294 } else {
295 element.total_duration_ns += value.total_duration_ns;
296 element.min_duration_ns =
297 std::min(element.min_duration_ns, value.min_duration_ns);
298 element.max_duration_ns =
299 std::max(element.max_duration_ns, value.max_duration_ns);
300 element.occurrences += 1;
301 }
302 }
303
MergeKernelReports(const KernelReportMap & reports,KernelReportMap * dst)304 void MergeKernelReports(const KernelReportMap& reports, KernelReportMap* dst) {
305 for (auto& kernel_value : reports) {
306 InsertOrUpdateKernelReport(kernel_value.first, kernel_value.second, dst);
307 }
308 }
309
GroupKernelReportsByOpName(const KernelStatsDb & kernel_stats_db)310 KernelStatsByOpName GroupKernelReportsByOpName(
311 const KernelStatsDb& kernel_stats_db) {
312 KernelStatsByOpName op_level_kernel_stats;
313 for (const KernelReport& kernel_report : kernel_stats_db.reports()) {
314 auto ret = op_level_kernel_stats.emplace(kernel_report.op_name(),
315 OpLevelKernelStats());
316 if (ret.second) {
317 // Inserted. Add a new op in <op_level_kernel_stats>.
318 OpLevelKernelStats& stats = ret.first->second;
319 stats.is_op_tensor_core_eligible =
320 kernel_report.is_op_tensor_core_eligible();
321 stats.total_duration_ns += kernel_report.total_duration_ns();
322 if (kernel_report.is_kernel_using_tensor_core()) {
323 stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
324 }
325 } else {
326 // Not inserted. Aggregate kernel stats to op level.
327 OpLevelKernelStats& stats = ret.first->second;
328 // Verifies operations with the same name have the same TensorCore
329 // eligibility.
330 DCHECK_EQ(stats.is_op_tensor_core_eligible,
331 kernel_report.is_op_tensor_core_eligible());
332 stats.total_duration_ns += kernel_report.total_duration_ns();
333 if (kernel_report.is_kernel_using_tensor_core()) {
334 stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
335 }
336 }
337 }
338 return op_level_kernel_stats;
339 }
340
341 } // namespace profiler
342 } // namespace tensorflow
343