1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
17 #include "absl/strings/str_cat.h"
18 #include "absl/strings/str_format.h"
19 #include "tensorflow/compiler/xla/metric_table_report.h"
20 #include "tensorflow/compiler/xla/util.h"
21 #include "tensorflow/core/lib/strings/numbers.h"
22 
23 namespace xla {
24 
25 using absl::StrAppend;
26 using absl::StrAppendFormat;
27 using absl::StrCat;
28 using absl::StrFormat;
29 using tensorflow::strings::HumanReadableElapsedTime;
30 using tensorflow::strings::HumanReadableNumBytes;
31 
ToString() const32 string HumanReadableProfileBuilder::ToString() const {
33   string s;
34 
35   StrAppendFormat(&s, "Execution profile for %s: (%s @ f_nom)\n",
36                   computation_name_,
37                   HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)));
38 
39   int64 cumulative_cycles = 0;
40   auto print_op = [&](const OpInfo& op, bool is_total = false) {
41     // Skip ops with 0 optimal seconds and 0 actual cycles.  These are ops that
42     // were expected to be free and are actually free -- things like (on most
43     // backends) kParameter or kConstant HLOs.  There's no need to clutter the
44     // profile with these.
45     if (op.optimal_seconds == 0 && op.cycles == 0) {
46       return;
47     }
48 
49     string bytes_per_sec;
50     string bytes_per_cycle;
51     if (op.cycles > 0 && op.bytes_accessed >= 0) {
52       bytes_per_sec = StrCat(
53           HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)),
54           "/s");
55       double bpc = static_cast<double>(op.bytes_accessed) / op.cycles;
56       if (op.bytes_accessed > op.cycles) {
57         bytes_per_cycle = StrCat(HumanReadableNumBytes(bpc), "/cycle");
58       } else {
59         bytes_per_cycle = StrFormat("%.3fB/cycle", bpc);
60       }
61     }
62 
63     double cumulative_cycles_percent = 0;
64     double cycles_percent = 0;
65     if (!is_total) {
66       cumulative_cycles += op.cycles;
67     }
68     if (total_cycles_ > 0) {
69       cycles_percent = op.cycles / static_cast<double>(total_cycles_) * 100;
70       cumulative_cycles_percent =
71           cumulative_cycles / static_cast<double>(total_cycles_) * 100;
72     }
73 
74     string cycles_percent_str;
75     if (is_total) {
76       // Leaving off the two trailing decimal points of "100.%" lets us save two
77       // columns in the output.
78       cycles_percent_str = "100.% 100Σ";
79     } else {
80       cycles_percent_str = StrFormat("%5.2f%% %2.0fΣ", cycles_percent,
81                                      cumulative_cycles_percent);
82     }
83 
84     double nsecs = op.cycles / clock_rate_ghz_;
85     StrAppendFormat(
86         &s,
87         "%15d cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: "
88         "%16s :: %s\n",
89         op.cycles, cycles_percent_str, CyclesToMicroseconds(op.cycles),
90         op.optimal_seconds < 0
91             ? ""
92             : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6),
93         op.flop_count > 0 && nsecs > 0
94             ? HumanReadableNumFlops(op.flop_count, nsecs)
95             : "",
96         op.transcendental_count > 0 && nsecs > 0
97             ? HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs)
98             : "",
99         bytes_per_sec, bytes_per_cycle, op.name);
100   };
101 
102   double optimal_seconds_sum = 0;
103   int64 total_flops = 0.;
104   int64 total_transcendentals = 0.;
105   int64 total_bytes = 0;
106   for (const auto& op : op_infos_) {
107     if (op.optimal_seconds > 0) {
108       // An op can run faster than the estimated optimum. For example, we might
109       // estimate a fusion's speed by looking at the size of its operands and
110       // result, but perhaps the fusion doesn't read the entirety of all of its
111       // inputs.  For the purposes of summing the instructions' optimal speeds,
112       // we treat the "optimum" as the smallest of either the estimated optimum
113       // and the actual speed.
114       optimal_seconds_sum +=
115           std::min(double{op.optimal_seconds}, CyclesToSeconds(op.cycles));
116     }
117     total_flops += std::max(op.flop_count, int64{0});
118     total_transcendentals += std::max(op.transcendental_count, int64{0});
119     total_bytes += std::max(op.bytes_accessed, int64{0});
120   }
121 
122   VLOG(1) << "Total floating point ops: " << total_flops;
123 
124   print_op({is_entry_computation_ ? "[total] [entry]" : "[total]", "[total]",
125             /*category=*/"", total_cycles_, total_flops, total_transcendentals,
126             total_bytes, static_cast<float>(optimal_seconds_sum)},
127            /*is_total=*/true);
128 
129   // Sort ops in decreasing order of cycles, and print them.
130   std::vector<OpInfo> sorted_ops(op_infos_);
131   absl::c_sort(sorted_ops, [](const OpInfo& a, const OpInfo& b) {
132     return a.cycles > b.cycles;
133   });
134   for (const auto& op : sorted_ops) {
135     print_op(op);
136   }
137 
138   if (total_cycles_ <= 0) {
139     StrAppend(&s, "****** 0 total cycles ******\n");
140   } else {
141     // Only show an optimal discrepancy table if at least one value was
142     // specified. Estimates are non-negative, so if the sum is greater than
143     // zero, then at least one summand was greater than zero.
144     if (optimal_seconds_sum > 0) {
145       MetricTableReport table;
146       table.SetMetricName("microseconds above estimated optimum");
147       table.SetEntryName("ops");
148       table.SetShowCategoryTable();
149       table.SetShowAllEntries();
150       float total_discrepancy_in_microseconds = 0.0f;
151       for (const auto& op : op_infos_) {
152         // Skip ops with < 0 optimal seconds.  These are ops for which we don't
153         // know the optimal time.
154         if (op.optimal_seconds < 0) {
155           continue;
156         }
157         // Also skip ops with 0 actual cycles.  These ops were free; there's no
158         // need to clutter the "above estimated optimum" table with them,
159         // because they can't be optimized further.
160         if (op.cycles == 0) {
161           continue;
162         }
163         MetricTableReport::Entry entry;
164         entry.text = op.name;
165         entry.short_text = op.short_name;
166         entry.category_text = op.category;
167         // Ignore ops that run faster than the estimated optimal here, as we do
168         // when calculating optimal_seconds_sum.
169         entry.metric = std::max(
170             0., CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6);
171         total_discrepancy_in_microseconds += entry.metric;
172         table.AddEntry(std::move(entry));
173       }
174       StrAppend(&s, table.MakeReport(total_discrepancy_in_microseconds));
175     }
176 
177     {
178       MetricTableReport table;
179       table.SetMetricName("microseconds");
180       table.SetEntryName("ops");
181       table.SetShowCategoryTable();
182       table.SetShowAllEntries();
183       for (const auto& op : op_infos_) {
184         // Skip ops with 0 optimal seconds and 0 actual cycles.  As in
185         // print_op(), these are uninteresting because they're expected to be
186         // free, and they were actually free.
187         if (op.cycles == 0 && op.optimal_seconds == 0) {
188           continue;
189         }
190         MetricTableReport::Entry entry;
191         entry.text = op.name;
192         entry.short_text = op.short_name;
193         entry.category_text = op.category;
194         entry.metric = CyclesToMicroseconds(op.cycles);
195         table.AddEntry(std::move(entry));
196       }
197       StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
198     }
199   }
200 
201   if (total_bytes > 0) {
202     MetricTableReport table;
203     table.SetMetricName("MiB read+written");
204     table.SetEntryName("ops");
205     table.SetShowCategoryTable();
206     for (const auto& op : op_infos_) {
207       MetricTableReport::Entry entry;
208       entry.text = op.name;
209       entry.short_text = op.short_name;
210       entry.category_text = op.category;
211       entry.metric = static_cast<double>(op.bytes_accessed) / (1 << 20);
212       table.AddEntry(std::move(entry));
213     }
214     StrAppend(&s,
215               table.MakeReport(static_cast<double>(total_bytes) / (1 << 20)));
216   }
217   return s;
218 }
219 
220 }  // namespace xla
221