/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_timeline.h"

#include <utility>

#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
namespace {
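// Maximum number of tensor allocations listed per sample in the
// "Top Allocations" memory counter below.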
int kMaxDisplayedMemNode = 10;

string GetTimeDevName(const string& dev) {
  if (dev.find("stream") != dev.npos) {
    return strings::StrCat("Op execution threads: ", dev);
  } else {
    return strings::StrCat("Op scheduling threads: ", dev);
  }
}
string GetMemoryLaneName(const string& dev) {
  return strings::StrCat("mem usage on:", dev);
}
}  // namespace

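// Builds a Chrome trace-format event object with the common fields
// (phase, category, name, process id, thread id, timestamp in microseconds).
// Callers add event-specific fields such as "dur", "args" or "id" before
// emitting it.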
Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
                                              const string& category,
                                              const string& name, int64 pid,
                                              int64 tid, int64 ts) {
  Json::Value event(Json::objectValue);
  event["ph"] = Json::Value(ph);
  event["cat"] = Json::Value(category);
  event["name"] = Json::Value(name);
  event["pid"] = Json::Int64(pid);
  event["tid"] = Json::Int64(tid);
  event["ts"] = Json::Int64(ts);
  return event;
}

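// Emits a "process_name" metadata event so chrome://tracing labels the lane
// with process id `pid` using the given human-readable name.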
void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) {
  Json::Value event(Json::objectValue);
  event["name"] = Json::Value("process_name");
  event["ph"] = Json::Value("M");
  event["pid"] = Json::Int64(pid);
  Json::Value args(Json::objectValue);
  args["name"] = Json::Value(name);
  event["args"] = args;
  metadata_.push_back(event);
}

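// Emits a complete event ("X") of `duration` microseconds starting at `ts`,
// rendered as a block on the (pid, tid) lane.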
void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
                                      int64 tid, const string& category,
                                      const string& name, Json::Value args) {
  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
  event["dur"] = Json::Int64(duration);
  event["args"] = std::move(args);
  metadata_.push_back(event);
}

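// Emits the start ("s") endpoint of a flow arrow; paired with EmitFlowEnd
// below through the shared `flow_id`.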
void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts,
                                         int64 pid, int64 tid, int64 flow_id) {
  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
                                       int64 tid, int64 flow_id) {
  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

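// Emits two counter ("C") events at timestamp `ts`: one tracking the total
// allocator bytes in use, and one listing the largest allocations (at most
// kMaxDisplayedMemNode entries) that account for those bytes, with the
// remainder reported as "Not Displayed".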
void ChromeTraceFormatter::EmitCounter(
    const string& category, const string& name, int64 pid, int64 ts,
    const string& device, int64 bytes,
    const std::map<int64, std::vector<string>>& tensor_mem) {
  Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
  Json::Value args(Json::objectValue);
  args["Allocator Bytes in Use"] = Json::Int64(bytes);
  event["args"] = args;
  events_.push_back(event);

  // TODO(xpan): chrome://tracing is not an ideal visualization for memory.
  // It would be great to have a customized UI for it.
  Json::Value event2 =
      CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
  Json::Value args2(Json::objectValue);
  // Need to reserve the same args for all locations.
  for (int i = 1; i < kMaxDisplayedMemNode; ++i) {
    args2[strings::Printf("Top Allocation %02d", i)] = Json::Value("N/A");
  }
  int count = 0;
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) {
        break;
      }
      args2[strings::Printf("Top Allocation %02d", count)] =
          Json::Value(strings::StrCat(it->first / 1000000.0, " MB from ", t));
      ++count;
      bytes -= it->first;
    }
  }
  args2[strings::StrCat("Not Displayed")] =
      Json::Value(strings::Printf("%.2f MB", bytes / 1000000.0));
  event2["args"] = args2;
  events_.push_back(event2);
}

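// Serializes all collected metadata and events into a single JSON string in
// the Chrome trace-event format, warning if the result is likely too large
// for chrome://tracing to load.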
string ChromeTraceFormatter::Format() {
  Json::Value trace;
  trace["traceEvents"] = Json::Value(Json::arrayValue);
  for (const Json::Value& v : metadata_) {
    trace["traceEvents"].append(v);
  }
  for (const Json::Value& v : events_) {
    trace["traceEvents"].append(v);
  }
  Json::FastWriter writer;
  string trace_str = writer.write(trace);
  if (trace_str.length() > 200 * 1024 * 1024) {
    fprintf(stderr,
            "Trace file is over 200MB. Chrome might not be able to "
            "display it. Consider using filters (e.g. -min_micros "
            "> 1000 or -op_type .*gpu:0.*) to reduce the size.\n");
  }
  return trace_str;
}

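// Records the per-step memory activity of `node` on its canonical device:
// raw allocation deltas (with accelerator persistent bytes attributed to
// time 0), a cumulative per-tensor allocation curve, and the allocator's
// bytes-in-use snapshots.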
void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
  if (!node->Trackable(step)) {
    return;
  }

  Device& dev = devices_[node->node->canonical_device()];

  std::map<int64, int64> allocs;
  for (const auto& alloc : node->node->allocations(step)) {
    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
  }
  dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
  allocs[0] += node->node->accelerator_persistent_bytes();

  int64 last = 0;
  std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()];
  for (auto it = allocs.begin(); it != allocs.end(); ++it) {
    last += it->second;
    aggregate_allocs[it->first] = last;
  }
  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
    if (bytes_in_use.first <= 0) continue;
    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
  }
}

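// Recursively walks `gnode` and its shown children. For every trackable node,
// registers its memory usage and creates one TimeNode per (device, start time)
// kernel execution, allocating a trace process lane for each device on first
// use.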
void Timeline::AllocateTimeNodes(GraphNode* gnode) {
  if (gnode->Trackable(step_)) {
    TrackNode(gnode);
    const TFGraphNode* node = gnode->node;
    for (const auto& kernel_execs : node->op_execs(step_)) {
      const string& device = kernel_execs.first;

      if (process_.find(device) == process_.end()) {
        int64 pid = AllocatePID();
        process_[device].reset(new Process(device, pid));
        chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
      }
      Process* p = process_[device].get();

      for (const auto& exec : kernel_execs.second) {
        int64 start_micros = exec.first;
        int64 exec_micros = exec.second;
        // TODO(xpan): There might be start time duplication here.
        if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
          // TODO(xpan): Give each kernel call a unique_name.
          tnodes_[device][start_micros].reset(
              new TimeNode(p, gnode, start_micros, exec_micros));
        }
      }
    }
  }
  for (GraphNode* n : gnode->show_children) {
    AllocateTimeNodes(n);
  }
}

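// Builds the full graph timeline: allocates TimeNodes for all nodes, links
// cross-device dependencies as flow arrows, emits one region per op execution
// and per-device memory counters for accelerator devices, then writes the
// trace to the output file.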
void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
  for (GraphNode* gnode : gnodes) {
    AllocateTimeNodes(gnode);
  }
  // To save memory, we only track cross-device (canonical device) flows.
  for (auto& process : tnodes_) {
    if (!IsCanonicalDevice(process.first)) continue;
    for (auto& tn : process.second) {
      TimeNode* tnode = tn.second.get();
      for (GraphNode* inp : tnode->node->children) {
        if (!inp->account || !inp->Trackable(step_)) {
          continue;
        }
        for (const auto& execs : inp->node->cpu_execs(step_)) {
          if (!IsCanonicalDevice(execs.first)) continue;
          if (process.first == execs.first) {
            // Not interested in flow within the same device.
            continue;
          }
          for (const auto& exec : execs.second) {
            int64 start_micros = exec.first;
            auto cprocess = tnodes_.find(execs.first);
            if (cprocess == tnodes_.end()) continue;
            auto ctn = cprocess->second.find(start_micros);
            if (ctn == cprocess->second.end()) continue;
            ctn->second->next_tnodes.push_back(tnode);
          }
        }
      }
    }
  }

  AllocateLanes();
  fprintf(stdout, "generating trace file.\n");
  int64 flow_id = 1;
  for (const auto& process : alloc_nodes_) {
    for (const auto& lane : process.second) {
      for (const auto& node : lane.second) {
        TimeNode* tnode = node.second;

        Json::Value args(Json::objectValue);
        args["name"] = Json::Value(tnode->name());
        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
                                     process.first, lane.first, "Op",
                                     tnode->name(), args);
        // Flow is a directed arrow pointing from src to dst.
        // TODO(xpan): Disable flow to reduce json file size for now. Need
        // to think of a better way to make flow interpretable.
        for (TimeNode* next_tnode : node.second->next_tnodes) {
          chrome_formatter_.EmitFlowStart(
              tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
              process.first, lane.first, flow_id);
          chrome_formatter_.EmitFlowEnd(
              tnode->name() + "_flow", next_tnode->start_micros,
              next_tnode->process->pid, next_tnode->tid, flow_id);
          flow_id += 1;
        }
      }
    }
  }
  for (const auto& dev : mem_tracker_.devices()) {
    if (IsPlacedOnCPU(dev.first)) {
      // TODO(xpan): Maybe also support CPU allocator memory tracking.
      continue;
    }
    int64 pid = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
    int64 pid2 = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
                              pid2);

    const MemoryTracker::Device& device = dev.second;

    int64 max_bytes_in_use = 0;
    int64 cur_bytes_in_use = 0;
    int64 last_point = 0;
    for (const auto& alloc : device.allocations) {
      cur_bytes_in_use = alloc.second;
      max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
      // Do not plot too densely, to reduce file size.
      int64 ts = alloc.first;
      if (ts - last_point < 100) continue;
      last_point = ts;

      std::map<int64, std::vector<string>> tensor_mem;
      for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
        const auto& tensor_alloc = tensor_alloc_it.second;
        auto it = tensor_alloc.lower_bound(ts);
        if (it != tensor_alloc.begin()) {
          --it;
        }
        if (it->second > 0) {
          tensor_mem[it->second].push_back(tensor_alloc_it.first);
        }
      }
      chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
                                    dev.first, cur_bytes_in_use, tensor_mem);
    }
    if (IsPlacedOnAccelerator(dev.first)) {
      fprintf(stdout, "%s peak memory: %.2f MB\n", dev.first.c_str(),
              max_bytes_in_use / 1000000.0);
    }
  }
  OutputTimeline();
}

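// The scope and code timelines below reuse EmitTreeNode to lay out the
// aggregated profile tree as nested trace regions.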
void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

void Timeline::GenerateCodeTimeline(const CodeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

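// Writes the formatted trace to "<outfile_>_<step>" and prints instructions
// for loading it in chrome://tracing.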
void Timeline::OutputTimeline() {
  string outfile = strings::Printf("%s_%lld", outfile_.c_str(), step());
  Status s =
      WriteStringToFile(Env::Default(), outfile, chrome_formatter_.Format());
  if (!s.ok()) {
    fprintf(stderr, "Failed to write timeline file: %s\nError: %s\n",
            outfile.c_str(), s.ToString().c_str());
    return;
  }
  fprintf(stdout, "\n******************************************************\n");
  fprintf(stdout,
          "Timeline file is written to %s.\n"
          "Open a Chrome browser, enter URL chrome://tracing and "
          "load the timeline file.",
          outfile.c_str());
  fprintf(stdout, "\n******************************************************\n");
  fflush(stdout);
}

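// Greedily packs each device's TimeNodes into non-overlapping lanes (trace
// thread ids): a node is placed on the first lane whose existing events end
// early enough that the node does not overlap them; otherwise a new lane is
// created.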
void Timeline::AllocateLanes() {
  for (auto& process : tnodes_) {
    Process* p = process_[process.first].get();
    for (auto& tnode : process.second) {
      int64 start_time = tnode.second->start_micros;
      int64 end_time = tnode.second->start_micros + tnode.second->exec_micros;
      int64 l = -1;
      for (int64 i = 0; i < p->lanes.size(); ++i) {
        const auto& lane = p->lanes[i];
        l = i;
        for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
          if (cur_it->second > start_time) {
            l = -1;
            break;
          }
          if (start_time > cur_it->second) {
            break;
          }
        }
        if (l >= 0) {
          break;
        }
      }
      if (l < 0) {
        l = p->lanes.size();
        std::map<int64, int64> nlane;
        nlane[start_time] = end_time;
        p->lanes.push_back(nlane);
      } else {
        p->lanes[l][start_time] = end_time;
      }
      tnode.second->tid = l;
      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
    }
  }
}

int64 Timeline::AllocatePID() {
  int64 cur_pid = next_pid_;
  next_pid_ += 1;
  return cur_pid;
}

}  // namespace tfprof
}  // namespace tensorflow