/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_timeline.h"

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
namespace {
int kMaxDisplayedMemNode = 10;

std::string GetTimeDevName(const std::string& dev) {
  if (dev.find("stream") != dev.npos) {
    return absl::StrCat("Op execution threads: ", dev);
  } else {
    return absl::StrCat("Op scheduling threads: ", dev);
  }
}

std::string GetMemoryLaneName(const std::string& dev) {
  return absl::StrCat("mem usage on:", dev);
}
}  // namespace

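// Builds a single Chrome trace event. |ph| is the event phase used by
// chrome://tracing (e.g. "X" for complete events, "C" for counters, "M" for
// metadata), and |ts| is the timestamp in microseconds. The returned object
// has the shape {"ph": ..., "cat": ..., "name": ..., "pid": ..., "tid": ...,
// "ts": ...}.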
Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
                                              const string& category,
                                              const string& name, int64 pid,
                                              int64 tid, int64 ts) {
  Json::Value event(Json::objectValue);
  event["ph"] = Json::Value(ph);
  event["cat"] = Json::Value(category);
  event["name"] = Json::Value(name);
  event["pid"] = Json::Int64(pid);
  event["tid"] = Json::Int64(tid);
  event["ts"] = Json::Int64(ts);
  return event;
}

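// Emits a "process_name" metadata event so chrome://tracing labels the row
// for process |pid| with the human-readable |name|.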
void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) {
  Json::Value event(Json::objectValue);
  event["name"] = Json::Value("process_name");
  event["ph"] = Json::Value("M");
  event["pid"] = Json::Int64(pid);
  Json::Value args(Json::objectValue);
  args["name"] = Json::Value(name);
  event["args"] = args;
  metadata_.push_back(event);
}

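// Emits a complete ("X") event lasting |duration| microseconds starting at
// |ts|, drawn as a box on the |pid|/|tid| lane.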
void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
                                      int64 tid, const string& category,
                                      const string& name, Json::Value args) {
  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
  event["dur"] = Json::Int64(duration);
  event["args"] = std::move(args);
  metadata_.push_back(event);
}

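// Emits the source ("s") end of a flow arrow; paired with EmitFlowEnd through
// the shared |flow_id|.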
void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts,
                                         int64 pid, int64 tid, int64 flow_id) {
  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

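// Emits the destination ("t") end of a flow arrow started by EmitFlowStart
// with the same |flow_id|.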
void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
                                       int64 tid, int64 flow_id) {
  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

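// Emits two counter ("C") events at |ts|: one tracking the allocator's total
// bytes in use on |device|, and one listing up to kMaxDisplayedMemNode of the
// largest live tensor allocations (largest first), with the remainder summed
// under "Not Displayed".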
void ChromeTraceFormatter::EmitCounter(
    const string& category, const string& name, int64 pid, int64 ts,
    const string& device, int64 bytes,
    const std::map<int64, std::vector<string>>& tensor_mem) {
  Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
  Json::Value args(Json::objectValue);
  args["Allocator Bytes in Use"] = Json::Int64(bytes);
  event["args"] = args;
  events_.push_back(event);

  // TODO(xpan): chrome://tracing is not an ideal visualization for memory.
  // It would be great to have a customized UI for it.
  Json::Value event2 =
      CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
  Json::Value args2(Json::objectValue);
  // Need to reserve the same args for all locations.
  for (int i = 1; i < kMaxDisplayedMemNode; ++i) {
    args2[absl::StrFormat("Top Allocation %02d", i)] = Json::Value("N/A");
  }
  int count = 0;
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) {
        break;
      }
      args2[absl::StrFormat("Top Allocation %02d", count)] =
          Json::Value(absl::StrCat(it->first / 1000000.0, " MB from ", t));
      ++count;
      bytes -= it->first;
    }
  }
  args2[std::string("Not Displayed")] =
      Json::Value(absl::StrFormat("%.2f MB", bytes / 1000000.0));
  event2["args"] = args2;
  events_.push_back(event2);
}

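// Serializes all collected metadata and events into Chrome trace JSON, i.e.
// {"traceEvents": [{"ph": "M", ...}, {"ph": "X", ...}, ...]}, and warns if
// the result is likely too large for chrome://tracing to load.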
string ChromeTraceFormatter::Format() {
  Json::Value trace;
  trace["traceEvents"] = Json::Value(Json::arrayValue);
  for (const Json::Value& v : metadata_) {
    trace["traceEvents"].append(v);
  }
  for (const Json::Value& v : events_) {
    trace["traceEvents"].append(v);
  }
  Json::FastWriter writer;
  string trace_str = writer.write(trace);
  if (trace_str.length() > 200 * 1024 * 1024) {
    absl::FPrintF(stderr,
                  "Trace file is over 200MB. Chrome might not be able to "
                  "display it. Consider using filters (e.g. -min_micros "
                  "> 1000 or -op_type .*gpu:0.*) to reduce the size.\n");
  }
  return trace_str;
}

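// Records the memory activity of |node| at |step| on its canonical device:
// a cumulative per-tensor allocation curve, tracked allocation deltas
// (accelerator persistent bytes are attributed to time 0), and snapshots of
// the allocator's total bytes in use.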
void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
  if (!node->Trackable(step)) {
    return;
  }

  Device& dev = devices_[node->node->canonical_device()];

  std::map<int64, int64> allocs;
  for (const auto& alloc : node->node->allocations(step)) {
    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
  }
  dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
  allocs[0] += node->node->accelerator_persistent_bytes();

  int64 last = 0;
  std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()];
  for (auto it = allocs.begin(); it != allocs.end(); ++it) {
    last += it->second;
    aggregate_allocs[it->first] = last;
  }
  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
    if (bytes_in_use.first <= 0) continue;
    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
  }
}

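// Recursively walks |gnode| and its shown children, creating one Process per
// device and one TimeNode per kernel execution; the TimeNodes are later
// packed into non-overlapping lanes by AllocateLanes().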
void Timeline::AllocateTimeNodes(GraphNode* gnode) {
  if (gnode->Trackable(step_)) {
    TrackNode(gnode);
    const TFGraphNode* node = gnode->node;
    for (const auto& kernel_execs : node->op_execs(step_)) {
      const string& device = kernel_execs.first;

      if (process_.find(device) == process_.end()) {
        int64 pid = AllocatePID();
        process_[device].reset(new Process(device, pid));
        chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
      }
      Process* p = process_[device].get();

      for (const auto& exec : kernel_execs.second) {
        int64 start_micros = exec.first;
        int64 exec_micros = exec.second;
        // TODO(xpan): There might be start time duplication here.
        if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
          // TODO(xpan): Give each kernel call a unique_name.
          tnodes_[device][start_micros].reset(
              new TimeNode(p, gnode, start_micros, exec_micros));
        }
      }
    }
  }
  for (GraphNode* n : gnode->show_children) {
    AllocateTimeNodes(n);
  }
}

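// Builds the graph timeline: allocates time nodes for all graph nodes, links
// cross-device data flows, assigns lanes, emits op regions and flow arrows,
// adds memory counter lanes for accelerator devices, then writes the trace.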
void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
  for (GraphNode* gnode : gnodes) {
    AllocateTimeNodes(gnode);
  }
  // To save memory, we only track cross-device (canonical device) flows.
  for (auto& process : tnodes_) {
    if (!IsCanonicalDevice(process.first)) continue;
    for (auto& tn : process.second) {
      TimeNode* tnode = tn.second.get();
      for (GraphNode* inp : tnode->node->children) {
        if (!inp->account || !inp->Trackable(step_)) {
          continue;
        }
        for (const auto& execs : inp->node->cpu_execs(step_)) {
          if (!IsCanonicalDevice(execs.first)) continue;
          if (process.first == execs.first) {
            // Not interested in flow within the same device.
            continue;
          }
          for (const auto& exec : execs.second) {
            int64 start_micros = exec.first;
            auto cprocess = tnodes_.find(execs.first);
            if (cprocess == tnodes_.end()) continue;
            auto ctn = cprocess->second.find(start_micros);
            if (ctn == cprocess->second.end()) continue;
            ctn->second->next_tnodes.push_back(tnode);
          }
        }
      }
    }
  }

  AllocateLanes();
  absl::FPrintF(stdout, "generating trace file.\n");
  int64 flow_id = 1;
  for (const auto& process : alloc_nodes_) {
    for (const auto& lane : process.second) {
      for (const auto& node : lane.second) {
        TimeNode* tnode = node.second;

        Json::Value args(Json::objectValue);
        args["name"] = Json::Value(tnode->name());
        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
                                     process.first, lane.first, "Op",
                                     tnode->name(), args);
        // Flow is a directed arrow pointing from src to dst.
        // TODO(xpan): Disable flow to reduce json file size for now. Need
        // to think of a better way to make flow interpretable.
        for (TimeNode* next_tnode : node.second->next_tnodes) {
          chrome_formatter_.EmitFlowStart(
              tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
              process.first, lane.first, flow_id);
          chrome_formatter_.EmitFlowEnd(
              tnode->name() + "_flow", next_tnode->start_micros,
              next_tnode->process->pid, next_tnode->tid, flow_id);
          flow_id += 1;
        }
      }
    }
  }
  for (const auto& dev : mem_tracker_.devices()) {
    if (IsPlacedOnCPU(dev.first)) {
      // TODO(xpan): Maybe also support CPU allocator memory tracking.
      continue;
    }
    int64 pid = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
    int64 pid2 = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
                              pid2);

    const MemoryTracker::Device& device = dev.second;

    int64 max_bytes_in_use = 0;
    int64 cur_bytes_in_use = 0;
    int64 last_point = 0;
    for (const auto& alloc : device.allocations) {
      cur_bytes_in_use = alloc.second;
      max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
      // Do not plot points too densely, to reduce file size.
      int64 ts = alloc.first;
      if (ts - last_point < 100) continue;
      last_point = ts;

      std::map<int64, std::vector<string>> tensor_mem;
      for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
        const auto& tensor_alloc = tensor_alloc_it.second;
        auto it = tensor_alloc.lower_bound(ts);
        if (it != tensor_alloc.begin()) {
          --it;
        }
        if (it->second > 0) {
          tensor_mem[it->second].push_back(tensor_alloc_it.first);
        }
      }
      chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
                                    dev.first, cur_bytes_in_use, tensor_mem);
    }
    if (IsPlacedOnAccelerator(dev.first)) {
      absl::FPrintF(stdout, "%s peak memory: %.2f MB\n", dev.first,
                    max_bytes_in_use / 1000000.0);
    }
  }
  OutputTimeline();
}

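// Emits a timeline for the scope view by recursively emitting tree nodes
// starting from the root scope node.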
void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

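// Same as GenerateScopeTimeline, but for the code view rooted at |node|.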
void Timeline::GenerateCodeTimeline(const CodeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

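// Writes the formatted trace to "<outfile_>_<step>" and prints instructions
// for loading it in chrome://tracing.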
void Timeline::OutputTimeline() {
  std::string outfile = absl::StrFormat("%s_%d", outfile_, step());
  Status s =
      WriteStringToFile(Env::Default(), outfile, chrome_formatter_.Format());
  if (!s.ok()) {
    absl::FPrintF(stderr, "Failed to write timeline file: %s\nError: %s\n",
                  outfile, s.ToString());
    return;
  }
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  absl::FPrintF(stdout,
                "Timeline file is written to %s.\n"
                "Open a Chrome browser, enter URL chrome://tracing and "
                "load the timeline file.",
                outfile);
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  fflush(stdout);
}

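// Assigns every TimeNode of a process to the first lane (thread id) whose
// existing events do not overlap it in time, creating a new lane when none
// fits. The result is recorded in alloc_nodes_, keyed by pid, lane and start
// time.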
void Timeline::AllocateLanes() {
  for (auto& process : tnodes_) {
    Process* p = process_[process.first].get();
    for (auto& tnode : process.second) {
      int64 start_time = tnode.second->start_micros;
      int64 end_time = tnode.second->start_micros + tnode.second->exec_micros;
      int64 l = -1;
      for (int64 i = 0, end = p->lanes.size(); i < end; ++i) {
        const auto& lane = p->lanes[i];
        l = i;
        for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
          if (cur_it->second > start_time) {
            l = -1;
            break;
          }
          if (start_time > cur_it->second) {
            break;
          }
        }
        if (l >= 0) {
          break;
        }
      }
      if (l < 0) {
        l = p->lanes.size();
        std::map<int64, int64> nlane;
        nlane[start_time] = end_time;
        p->lanes.push_back(nlane);
      } else {
        p->lanes[l][start_time] = end_time;
      }
      tnode.second->tid = l;
      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
    }
  }
}

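// Returns the next unused Chrome trace process id.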
int64 Timeline::AllocatePID() {
  int64 cur_pid = next_pid_;
  next_pid_ += 1;
  return cur_pid;
}

}  // namespace tfprof
}  // namespace tensorflow