/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"

#include <algorithm>
#include <string>
#include <tuple>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"

namespace tensorflow {
namespace profiler {

namespace {

// The maximum number of kernels displayed on the Kernel Stats page.
const int kMaxNumOfKernels = 1000;

}  // namespace

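// Parses the kernel launch details recorded in an XStat. The details are a
// list of space- or newline-separated "key:value" tokens, e.g. a
// (hypothetical) input:
//   "regs:32 static_shared:2048 dynamic_shared:0 grid:1024,1,1
//    block:256,1,1 occ_pct:25.5"
// Unrecognized tokens are skipped; grid and block dimensions default to
// (1, 1, 1) when absent.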
void ParseKernelLaunchParams(absl::string_view xstat_kernel_details,
                             KernelReport* kernel) {
  const std::vector<absl::string_view> params =
      absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(" \n"));

  constexpr uint32 kNumDimensions = 3;
  for (uint32 dim = 0; dim < kNumDimensions; ++dim) {
    kernel->add_block_dim(1);
    kernel->add_grid_dim(1);
  }

  // Process tokens.
  for (const auto& param : params) {
    const std::vector<absl::string_view> key_value = absl::StrSplit(param, ':');
    if (key_value.size() != 2) {
      // Unrecognized token.
      continue;
    }
    absl::string_view key = key_value[0];
    absl::string_view value_str = key_value[1];
    uint32 value = 0;
    double pct = 0.0;
    // Cases that consume a pair of tokens "key:value".
    if (key == "regs" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_registers_per_thread(value);
    } else if (key == "static_shared" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_static_shmem_bytes(value);
    } else if (key == "dynamic_shared" && absl::SimpleAtoi(value_str, &value)) {
      kernel->set_dynamic_shmem_bytes(value);
    } else if (key == "block") {
      const std::vector<absl::string_view>& block =
          absl::StrSplit(value_str, ',');
      uint32 tmp[3];
      if (block.size() == 3 && absl::SimpleAtoi(block[0], &tmp[0]) &&
          absl::SimpleAtoi(block[1], &tmp[1]) &&
          absl::SimpleAtoi(block[2], &tmp[2])) {
        std::copy_n(tmp, 3, kernel->mutable_block_dim()->begin());
      }
    } else if (key == "grid") {
      const std::vector<absl::string_view>& grid =
          absl::StrSplit(value_str, ',');
      uint32 tmp[3];
      if (grid.size() == 3 && absl::SimpleAtoi(grid[0], &tmp[0]) &&
          absl::SimpleAtoi(grid[1], &tmp[1]) &&
          absl::SimpleAtoi(grid[2], &tmp[2])) {
        std::copy_n(tmp, 3, kernel->mutable_grid_dim()->begin());
      }
    } else if (key == "occ_pct" && absl::SimpleAtod(value_str, &pct)) {
      kernel->set_occupancy_pct(pct);
    }
  }
}

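// Heuristically infers TensorCore usage from the kernel name. The "884" and
// "1688" tags presumably encode the MMA tile shapes of Volta (m8n8k4) and
// Turing (m16n8k8) tensor-core kernels, while "hmma"/"xmma" mark
// mixed-precision matrix-multiply-accumulate templates.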
bool IsKernelUsingTensorCore(absl::string_view kernel_name) {
  // Some examples: volta_h884gemm, volta_fp16_s884gemm,
  // turing_fp16_s1688cudnn_fp16
  bool possible_tensor_kernel = absl::StrContains(kernel_name, "884") ||
                                absl::StrContains(kernel_name, "1688") ||
                                absl::StrContains(kernel_name, "hmma") ||
                                absl::StrContains(kernel_name, "xmma");
  if (possible_tensor_kernel) {
    VLOG(3) << "Possible tensor kernel: " << kernel_name;
  }

  return (absl::StartsWith(kernel_name, "volta_i884") ||
          absl::StartsWith(kernel_name, "volta_h884") ||
          absl::StartsWith(kernel_name, "volta_s884") ||
          absl::StartsWith(kernel_name, "volta_fp16_i884") ||
          absl::StartsWith(kernel_name, "volta_fp16_h884") ||
          absl::StartsWith(kernel_name, "volta_fp16_s884") ||
          absl::StartsWith(kernel_name, "turing_i1688") ||
          absl::StartsWith(kernel_name, "turing_h1688") ||
          absl::StartsWith(kernel_name, "turing_s1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_i1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_h1688") ||
          absl::StartsWith(kernel_name, "turing_fp16_s1688") ||
          absl::StrContains(kernel_name, "hmma") ||
          absl::StrContains(kernel_name, "xmma"));
}

// This list is not exhaustive.
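// For example, a (hypothetical) op named "model/dense_1/MatMul" matches the
// "/MatMul" suffix rule below.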
bool IsOpTensorCoreEligible(absl::string_view tf_op_name) {
  // Disable formatting to keep inline comments vertically aligned.
  // clang-format off
  return false
      // Using EndsWith to match Fused operations.
      || absl::EndsWith(tf_op_name, "Conv2D")
      || absl::EndsWith(tf_op_name, "Conv2DBackpropFilter")
      || absl::EndsWith(tf_op_name, "Conv2DBackpropInput")
      || absl::EndsWith(tf_op_name, "Conv3D")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNative")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropFilter")
      || absl::EndsWith(tf_op_name, "DepthwiseConv2dNativeBackpropInput")
      // Using Contains to match V2/V3 suffixes.
      || absl::StrContains(tf_op_name, "BatchMatMul")
      // MatMul requires exact matching.
      || absl::EndsWith(tf_op_name, "/MatMul")
      || absl::EndsWith(tf_op_name, "FusedMatMul")
      // cuDNN operations.
      || absl::EndsWith(tf_op_name, "/CudnnRNN")
      || absl::StrContains(tf_op_name, "CudnnRNNV")
      || absl::StrContains(tf_op_name, "CudnnRNNForward")
      || absl::StrContains(tf_op_name, "CudnnRNNBackprop")
      // Special cases.
      || absl::EndsWith(tf_op_name, "XlaDot");
  // clang-format on
}

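// An einsum equation is treated as TensorCore eligible when it has exactly
// two inputs and one output, e.g. the matmul-like equation "ab,bc->ac",
// since such contractions typically lower to a matrix multiply.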
bool IsEinsumTensorCoreEligible(absl::string_view equation) {
  if (equation.empty()) {
    return false;
  }
  const std::vector<absl::string_view> input_output =
      absl::StrSplit(equation, "->");
  if (input_output.size() != 2) {
    return false;
  }
  const std::vector<absl::string_view> lhs_rhs =
      absl::StrSplit(input_output[0], ',');
  return lhs_rhs.size() == 2;
}

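// Orders reports lexicographically over every field except the duration
// statistics, so reports that differ only in timing are tied.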
bool KernelReportLessThanComparator::operator()(const KernelReport& lhs,
                                                const KernelReport& rhs) const {
  // Disable formatting to keep vertical alignment for better readability,
  // and make it easier to reorder columns.
  // clang-format off
  auto lhs_tuple = std::make_tuple(
      lhs.name(),
      lhs.grid_dim(0),
      lhs.grid_dim(1),
      lhs.grid_dim(2),
      lhs.block_dim(0),
      lhs.block_dim(1),
      lhs.block_dim(2),
      lhs.registers_per_thread(),
      lhs.static_shmem_bytes(),
      lhs.dynamic_shmem_bytes(),
      lhs.is_kernel_using_tensor_core(),
      lhs.is_op_tensor_core_eligible(),
      lhs.op_name());

  auto rhs_tuple = std::make_tuple(
      rhs.name(),
      rhs.grid_dim(0),
      rhs.grid_dim(1),
      rhs.grid_dim(2),
      rhs.block_dim(0),
      rhs.block_dim(1),
      rhs.block_dim(2),
      rhs.registers_per_thread(),
      rhs.static_shmem_bytes(),
      rhs.dynamic_shmem_bytes(),
      rhs.is_kernel_using_tensor_core(),
      rhs.is_op_tensor_core_eligible(),
      rhs.op_name());
  // clang-format on
  return lhs_tuple < rhs_tuple;
}

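// Equality likewise ignores the duration statistics; this is what allows
// KernelReportMap to aggregate timing values across launches of the same
// kernel configuration.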
bool KernelReportEqualToComparator::operator()(const KernelReport& lhs,
                                               const KernelReport& rhs) const {
  // Disable formatting to keep vertical alignment for better readability,
  // and make it easier to reorder columns.
  // clang-format off
  // Put the most expensive string comparisons last.
  return (
      lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() &&
      lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() &&
      lhs.block_dim(0) == rhs.block_dim(0) &&
      lhs.block_dim(1) == rhs.block_dim(1) &&
      lhs.block_dim(2) == rhs.block_dim(2) &&
      lhs.grid_dim(0) == rhs.grid_dim(0) &&
      lhs.grid_dim(1) == rhs.grid_dim(1) &&
      lhs.grid_dim(2) == rhs.grid_dim(2) &&
      lhs.registers_per_thread() == rhs.registers_per_thread() &&
      lhs.static_shmem_bytes() == rhs.static_shmem_bytes() &&
      lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() &&
      lhs.name() == rhs.name() &&
      lhs.op_name() == rhs.op_name());
  // clang-format on
}

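// Sorts reports by total duration, descending, breaking ties with
// KernelReportLessThanComparator, and truncates the database to at most
// kMaxNumOfKernels entries. std::partial_sort suffices when truncating,
// since only the surviving prefix needs to be ordered.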
void SortAndKeepTopKDurationKernelReportsInDb(KernelStatsDb* kernel_stats_db) {
  auto comp = [](const KernelReport& lhs, const KernelReport& rhs) {
    return lhs.total_duration_ns() > rhs.total_duration_ns() ||
           (lhs.total_duration_ns() == rhs.total_duration_ns() &&
            KernelReportLessThanComparator()(lhs, rhs));
  };

  // Sort and keep at most <kMaxNumOfKernels> kernel reports.
  if (kernel_stats_db->reports_size() > kMaxNumOfKernels) {
    std::partial_sort(
        kernel_stats_db->mutable_reports()->begin(),
        kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
        kernel_stats_db->mutable_reports()->end(), comp);
    kernel_stats_db->mutable_reports()->erase(
        kernel_stats_db->mutable_reports()->begin() + kMaxNumOfKernels,
        kernel_stats_db->mutable_reports()->end());
  } else {
    std::sort(kernel_stats_db->mutable_reports()->begin(),
              kernel_stats_db->mutable_reports()->end(), comp);
  }
}

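// Sorts pointers into <reports> rather than the reports themselves to avoid
// copying protos, then copies only the top kMaxNumOfKernels entries to
// <dst>.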
void CopyTopKDurationKernelReportsToDb(const KernelReportMap& reports,
                                       KernelStatsDb* dst) {
  std::vector<std::pair<const KernelReport*, const KernelReportValue*>>
      kernels_to_sort;
  kernels_to_sort.reserve(reports.size());
  for (const auto& report_value : reports) {
    kernels_to_sort.push_back(
        std::make_pair(&report_value.first, &report_value.second));
  }

  auto comp =
      [](const std::pair<const KernelReport*, const KernelReportValue*>& lhs,
         const std::pair<const KernelReport*, const KernelReportValue*>& rhs) {
        return lhs.second->total_duration_ns > rhs.second->total_duration_ns ||
               (lhs.second->total_duration_ns ==
                    rhs.second->total_duration_ns &&
                KernelReportLessThanComparator()(*lhs.first, *rhs.first));
      };

  // Sort and copy at most <kMaxNumOfKernels> kernels to <dst>.
  if (kernels_to_sort.size() > kMaxNumOfKernels) {
    absl::c_partial_sort(kernels_to_sort,
                         kernels_to_sort.begin() + kMaxNumOfKernels, comp);
  } else {
    absl::c_sort(kernels_to_sort, comp);
  }

  int copy_size =
      std::min(kMaxNumOfKernels, static_cast<int>(kernels_to_sort.size()));
  for (int i = 0; i < copy_size; i++) {
    KernelReport* report = dst->add_reports();
    *report = *kernels_to_sort[i].first;
    const KernelReportValue& kernel_value = *kernels_to_sort[i].second;
    // Set value using KernelReportValue.
    report->set_occurrences(kernel_value.occurrences);
    report->set_min_duration_ns(kernel_value.min_duration_ns);
    report->set_max_duration_ns(kernel_value.max_duration_ns);
    report->set_total_duration_ns(kernel_value.total_duration_ns);
  }
}

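// An occurrence count of zero marks a default-constructed map slot, i.e. a
// kernel configuration seen for the first time; otherwise the running
// statistics are folded together.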
void InsertOrUpdateKernelReport(const KernelReport& kernel,
                                const KernelReportValue& value,
                                KernelReportMap* dst) {
  KernelReportValue& element = (*dst)[kernel];
  if (element.occurrences == 0) {
    element = value;
  } else {
    element.total_duration_ns += value.total_duration_ns;
    element.min_duration_ns =
        std::min(element.min_duration_ns, value.min_duration_ns);
    element.max_duration_ns =
        std::max(element.max_duration_ns, value.max_duration_ns);
    // Accumulate the incoming occurrence count, which may exceed one when
    // merging pre-aggregated maps, rather than counting a single launch.
    element.occurrences += value.occurrences;
  }
}

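// Merges every entry of <reports> into <dst>, aggregating entries that
// describe the same kernel configuration.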
void MergeKernelReports(const KernelReportMap& reports, KernelReportMap* dst) {
  for (const auto& kernel_value : reports) {
    InsertOrUpdateKernelReport(kernel_value.first, kernel_value.second, dst);
  }
}

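// Aggregates kernel-level stats to op level, summing the total and
// TensorCore durations of all kernels launched by each op.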
KernelStatsByOpName GroupKernelReportsByOpName(
    const KernelStatsDb& kernel_stats_db) {
  KernelStatsByOpName op_level_kernel_stats;
  for (const KernelReport& kernel_report : kernel_stats_db.reports()) {
    auto ret = op_level_kernel_stats.emplace(kernel_report.op_name(),
                                             OpLevelKernelStats());
    if (ret.second) {
      // Inserted. Add a new op in <op_level_kernel_stats>.
      OpLevelKernelStats& stats = ret.first->second;
      stats.is_op_tensor_core_eligible =
          kernel_report.is_op_tensor_core_eligible();
      stats.total_duration_ns += kernel_report.total_duration_ns();
      if (kernel_report.is_kernel_using_tensor_core()) {
        stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
      }
    } else {
      // Not inserted. Aggregate kernel stats to op level.
      OpLevelKernelStats& stats = ret.first->second;
      // Verifies operations with the same name have the same TensorCore
      // eligibility.
      DCHECK_EQ(stats.is_op_tensor_core_eligible,
                kernel_report.is_op_tensor_core_eligible());
      stats.total_duration_ns += kernel_report.total_duration_ns();
      if (kernel_report.is_kernel_using_tensor_core()) {
        stats.tensor_core_duration_ns += kernel_report.total_duration_ns();
      }
    }
  }
  return op_level_kernel_stats;
}

}  // namespace profiler
}  // namespace tensorflow