1 /* Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/internal/tfprof_timeline.h"
17 
18 #include <utility>
19 
20 #include "tensorflow/core/lib/core/status.h"
21 #include "tensorflow/core/lib/strings/str_util.h"
22 #include "tensorflow/core/lib/strings/stringprintf.h"
23 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
24 
25 namespace tensorflow {
26 namespace tfprof {
27 namespace {
28 int kMaxDisplayedMemNode = 10;
29 
GetTimeDevName(const string & dev)30 string GetTimeDevName(const string& dev) {
31   if (dev.find("stream") != dev.npos) {
32     return strings::StrCat("Op execution threads: ", dev);
33   } else {
34     return strings::StrCat("Op scheduling threads: ", dev);
35   }
36 }
GetMemoryLaneName(const string & dev)37 string GetMemoryLaneName(const string& dev) {
38   return strings::StrCat("mem usage on:", dev);
39 }
40 }  // namespace
41 
CreateEvent(const string & ph,const string & category,const string & name,int64 pid,int64 tid,int64 ts)42 Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
43                                               const string& category,
44                                               const string& name, int64 pid,
45                                               int64 tid, int64 ts) {
46   Json::Value event(Json::objectValue);
47   event["ph"] = Json::Value(ph);
48   event["cat"] = Json::Value(category);
49   event["name"] = Json::Value(name);
50   event["pid"] = Json::Int64(pid);
51   event["tid"] = Json::Int64(tid);
52   event["ts"] = Json::Int64(ts);
53   return event;
54 }
55 
EmitPID(const string & name,int64 pid)56 void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) {
57   Json::Value event(Json::objectValue);
58   event["name"] = Json::Value("process_name");
59   event["ph"] = Json::Value("M");
60   event["pid"] = Json::Int64(pid);
61   Json::Value args(Json::objectValue);
62   args["name"] = Json::Value(name);
63   event["args"] = args;
64   metadata_.push_back(event);
65 }
66 
EmitRegion(int64 ts,int64 duration,int64 pid,int64 tid,const string & category,const string & name,Json::Value args)67 void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
68                                       int64 tid, const string& category,
69                                       const string& name, Json::Value args) {
70   Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
71   event["dur"] = Json::Int64(duration);
72   event["args"] = std::move(args);
73   metadata_.push_back(event);
74 }
75 
EmitFlowStart(const string & name,int64 ts,int64 pid,int64 tid,int64 flow_id)76 void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts,
77                                          int64 pid, int64 tid, int64 flow_id) {
78   Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
79   event["id"] = Json::Int64(flow_id);
80   events_.push_back(event);
81 }
82 
EmitFlowEnd(const string & name,int64 ts,int64 pid,int64 tid,int64 flow_id)83 void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
84                                        int64 tid, int64 flow_id) {
85   Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
86   event["id"] = Json::Int64(flow_id);
87   events_.push_back(event);
88 }
89 
EmitCounter(const string & category,const string & name,int64 pid,int64 ts,const string & device,int64 bytes,const std::map<int64,std::vector<string>> & tensor_mem)90 void ChromeTraceFormatter::EmitCounter(
91     const string& category, const string& name, int64 pid, int64 ts,
92     const string& device, int64 bytes,
93     const std::map<int64, std::vector<string>>& tensor_mem) {
94   Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
95   Json::Value args(Json::objectValue);
96   args["Allocator Bytes in Use"] = Json::Int64(bytes);
97   event["args"] = args;
98   events_.push_back(event);
99 
100   // TODO(xpan): chrome://tracing is not ideal visualization for memory.
101   // It would be great to have a customized UI for it.
102   Json::Value event2 =
103       CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
104   Json::Value args2(Json::objectValue);
105   // Need to reserve the same args for all locations.
106   for (int i = 1; i < kMaxDisplayedMemNode; ++i) {
107     args2[strings::Printf("Top Allocation %02d", i)] = Json::Value("N/A");
108   }
109   int count = 0;
110   for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
111     for (const string& t : it->second) {
112       if (bytes < it->first || count >= kMaxDisplayedMemNode) {
113         break;
114       }
115       args2[strings::Printf("Top Allocation %02d", count)] =
116           Json::Value(strings::StrCat(it->first / 1000000.0, " MB from ", t));
117       ++count;
118       bytes -= it->first;
119     }
120   }
121   args2[strings::StrCat("Not Displayed")] =
122       Json::Value(strings::Printf("%.2f MB", bytes / 1000000.0));
123   event2["args"] = args2;
124   events_.push_back(event2);
125 }
126 
Format()127 string ChromeTraceFormatter::Format() {
128   Json::Value trace;
129   trace["traceEvents"] = Json::Value(Json::arrayValue);
130   for (const Json::Value& v : metadata_) {
131     trace["traceEvents"].append(v);
132   }
133   for (const Json::Value& v : events_) {
134     trace["traceEvents"].append(v);
135   }
136   Json::FastWriter writer;
137   string trace_str = writer.write(trace);
138   if (trace_str.length() > 200 * 1024 * 1024) {
139     fprintf(stderr,
140             "Trace file is over 200MB. Chrome might not be able to "
141             "display it. Consider to use filters (e.g. -min_micros "
142             "> 1000 or -op_type .*gpu:0.* to reduce the size.\n");
143   }
144   return trace_str;
145 }
146 
TrackNode(int64 step,const GraphNode * node)147 void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
148   if (!node->Trackable(step)) {
149     return;
150   }
151 
152   Device& dev = devices_[node->node->canonical_device()];
153 
154   std::map<int64, int64> allocs;
155   for (const auto& alloc : node->node->allocations(step)) {
156     allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
157     dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
158   }
159   dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
160   allocs[0] += node->node->accelerator_persistent_bytes();
161 
162   int64 last = 0;
163   std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()];
164   for (auto it = allocs.begin(); it != allocs.end(); ++it) {
165     last += it->second;
166     aggregate_allocs[it->first] = last;
167   }
168   for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
169     if (bytes_in_use.first <= 0) continue;
170     dev.allocations[bytes_in_use.first] = bytes_in_use.second;
171   }
172 }
173 
AllocateTimeNodes(GraphNode * gnode)174 void Timeline::AllocateTimeNodes(GraphNode* gnode) {
175   if (gnode->Trackable(step_)) {
176     TrackNode(gnode);
177     const TFGraphNode* node = gnode->node;
178     for (const auto& kernel_execs : node->op_execs(step_)) {
179       const string& device = kernel_execs.first;
180 
181       if (process_.find(device) == process_.end()) {
182         int64 pid = AllocatePID();
183         process_[device].reset(new Process(device, pid));
184         chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
185       }
186       Process* p = process_[device].get();
187 
188       for (const auto& exec : kernel_execs.second) {
189         int64 start_micros = exec.first;
190         int64 exec_micros = exec.second;
191         // TODO(xpan): There might be start time duplication here.
192         if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
193           // TODO(xpan): Give each kernel call a unique_name.
194           tnodes_[device][start_micros].reset(
195               new TimeNode(p, gnode, start_micros, exec_micros));
196         }
197       }
198     }
199   }
200   for (GraphNode* n : gnode->show_children) {
201     AllocateTimeNodes(n);
202   }
203 }
204 
GenerateGraphTimeline(const std::vector<GraphNode * > & gnodes)205 void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
206   for (GraphNode* gnode : gnodes) {
207     AllocateTimeNodes(gnode);
208   }
209   // To save memory, we only track cross-device (canonical device) flows.
210   for (auto& process : tnodes_) {
211     if (!IsCanonicalDevice(process.first)) continue;
212     for (auto& tn : process.second) {
213       TimeNode* tnode = tn.second.get();
214       for (GraphNode* inp : tnode->node->children) {
215         if (!inp->account || !inp->Trackable(step_)) {
216           continue;
217         }
218         for (const auto& execs : inp->node->cpu_execs(step_)) {
219           if (!IsCanonicalDevice(execs.first)) continue;
220           if (process.first == execs.first) {
221             // Not interested in flow within the same device.
222             continue;
223           }
224           for (const auto& exec : execs.second) {
225             int64 start_micros = exec.first;
226             auto cprocess = tnodes_.find(execs.first);
227             if (cprocess == tnodes_.end()) continue;
228             auto ctn = cprocess->second.find(start_micros);
229             if (ctn == cprocess->second.end()) continue;
230             ctn->second->next_tnodes.push_back(tnode);
231           }
232         }
233       }
234     }
235   }
236 
237   AllocateLanes();
238   fprintf(stdout, "generating trace file.\n");
239   int64 flow_id = 1;
240   for (const auto& process : alloc_nodes_) {
241     for (const auto& lane : process.second) {
242       for (const auto& node : lane.second) {
243         TimeNode* tnode = node.second;
244 
245         Json::Value args(Json::objectValue);
246         args["name"] = Json::Value(tnode->name());
247         chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
248                                      process.first, lane.first, "Op",
249                                      tnode->name(), args);
250         // Flow is a directed arrow pointing from src to dst.
251         // TODO(xpan): Disable flow to reduce json file size for now. Need
252         // to think of a better way to make flow interpretable.
253         for (TimeNode* next_tnode : node.second->next_tnodes) {
254           chrome_formatter_.EmitFlowStart(
255               tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
256               process.first, lane.first, flow_id);
257           chrome_formatter_.EmitFlowEnd(
258               tnode->name() + "_flow", next_tnode->start_micros,
259               next_tnode->process->pid, next_tnode->tid, flow_id);
260           flow_id += 1;
261         }
262       }
263     }
264   }
265   for (const auto& dev : mem_tracker_.devices()) {
266     if (IsPlacedOnCPU(dev.first)) {
267       // TODO(xpan): Maybe also support CPU allocator memory tracking.
268       continue;
269     }
270     int64 pid = AllocatePID();
271     chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
272     int64 pid2 = AllocatePID();
273     chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
274                               pid2);
275 
276     const MemoryTracker::Device& device = dev.second;
277 
278     int64 max_bytes_in_use = 0;
279     int64 cur_bytes_in_use = 0;
280     int64 last_point = 0;
281     for (const auto& alloc : device.allocations) {
282       cur_bytes_in_use = alloc.second;
283       max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
284       // Do not plot too dense to reduce file size.
285       int64 ts = alloc.first;
286       if (ts - last_point < 100) continue;
287       last_point = ts;
288 
289       std::map<int64, std::vector<string>> tensor_mem;
290       for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
291         const auto& tensor_alloc = tensor_alloc_it.second;
292         auto it = tensor_alloc.lower_bound(ts);
293         if (it != tensor_alloc.begin()) {
294           --it;
295         }
296         if (it->second > 0) {
297           tensor_mem[it->second].push_back(tensor_alloc_it.first);
298         }
299       }
300       chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
301                                     dev.first, cur_bytes_in_use, tensor_mem);
302     }
303     if (IsPlacedOnAccelerator(dev.first)) {
304       fprintf(stdout, "%s peak memory: %.2f MB\n", dev.first.c_str(),
305               max_bytes_in_use / 1000000.0);
306     }
307   }
308   OutputTimeline();
309 }
310 
GenerateScopeTimeline(const ScopeNode * node)311 void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
312   std::set<int64> visited_depth;
313   EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
314   OutputTimeline();
315 }
316 
GenerateCodeTimeline(const CodeNode * node)317 void Timeline::GenerateCodeTimeline(const CodeNode* node) {
318   std::set<int64> visited_depth;
319   EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
320   OutputTimeline();
321 }
322 
OutputTimeline()323 void Timeline::OutputTimeline() {
324   string outfile = strings::Printf("%s_%lld", outfile_.c_str(), step());
325   Status s =
326       WriteStringToFile(Env::Default(), outfile, chrome_formatter_.Format());
327   if (!s.ok()) {
328     fprintf(stderr, "Failed to write timeline file: %s\nError: %s\n",
329             outfile.c_str(), s.ToString().c_str());
330     return;
331   }
332   fprintf(stdout, "\n******************************************************\n");
333   fprintf(stdout,
334           "Timeline file is written to %s.\n"
335           "Open a Chrome browser, enter URL chrome://tracing and "
336           "load the timeline file.",
337           outfile.c_str());
338   fprintf(stdout, "\n******************************************************\n");
339   fflush(stdout);
340 }
341 
AllocateLanes()342 void Timeline::AllocateLanes() {
343   for (auto& process : tnodes_) {
344     Process* p = process_[process.first].get();
345     for (auto& tnode : process.second) {
346       int64 start_time = tnode.second->start_micros;
347       int64 end_time = tnode.second->start_micros + tnode.second->exec_micros;
348       int64 l = -1;
349       for (int64 i = 0; i < p->lanes.size(); ++i) {
350         const auto& lane = p->lanes[i];
351         l = i;
352         for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
353           if (cur_it->second > start_time) {
354             l = -1;
355             break;
356           }
357           if (start_time > cur_it->second) {
358             break;
359           }
360         }
361         if (l >= 0) {
362           break;
363         }
364       }
365       if (l < 0) {
366         l = p->lanes.size();
367         std::map<int64, int64> nlane;
368         nlane[start_time] = end_time;
369         p->lanes.push_back(nlane);
370       } else {
371         p->lanes[l][start_time] = end_time;
372       }
373       tnode.second->tid = l;
374       alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
375     }
376   }
377 }
378 
AllocatePID()379 int64 Timeline::AllocatePID() {
380   int64 cur_pid = next_pid_;
381   next_pid_ += 1;
382   return cur_pid;
383 }
384 
385 }  // namespace tfprof
386 }  // namespace tensorflow
387