1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/debug/debug_graph_utils.h"
17 
18 #include "tensorflow/core/common_runtime/memory_types.h"
19 #include "tensorflow/core/framework/kernel_def.pb.h"
20 #include "tensorflow/core/framework/node_def_builder.h"
21 #include "tensorflow/core/framework/op_kernel.h"
22 #include "tensorflow/core/graph/node_builder.h"
23 #include "tensorflow/core/lib/strings/strcat.h"
24 #include "tensorflow/core/protobuf/debug.pb.h"
25 
26 namespace tensorflow {
27 
28 namespace {
29 
30 // TODO(cais): Switch to safe_strtob when available.
ParseBoolString(const string & bool_str,bool * bool_val)31 Status ParseBoolString(const string& bool_str, bool* bool_val) {
32   const string lower_bool_str = absl::AsciiStrToLower(bool_str);
33   if (lower_bool_str == "false" || lower_bool_str == "f" ||
34       lower_bool_str == "0") {
35     *bool_val = false;
36   } else if (lower_bool_str == "true" || lower_bool_str == "t" ||
37              lower_bool_str == "1") {
38     *bool_val = true;
39   } else {
40     return errors::InvalidArgument("Invalid string for bool value: ", bool_str);
41   }
42   return Status::OK();
43 }
44 
45 }  // namespace
46 
47 // static
InsertNodes(const protobuf::RepeatedPtrField<DebugTensorWatch> & watches,Graph * graph,Device * device)48 Status DebugNodeInserter::InsertNodes(
49     const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
50     Device* device) {
51   // TODO(cais): This method is getting too large in size.
52   // Refactor it with helpers.
53 
54   if (watches.empty()) {
55     // Nothing to do: Return OK right away.
56     return Status::OK();
57   }
58 
59   // Debug ops and URLs for wildcard node names (if any).
60   std::vector<string> default_debug_ops;
61   std::vector<string> default_debug_urls;
62 
63   // A map from tensor name (e.g., "node_a:0") to list of debug op names
64   // (e.g., {"DebugIdentity", "DebugNanCount"})
65   std::unordered_map<string, std::vector<string>> tensor_watches;
66   // A map from tensor name to debug_url.
67   std::unordered_map<string, std::vector<string>> tensor_watch_urls;
68   std::unordered_map<string, bool> tensor_tolerate_failures;
69 
70   // Cache the proto content for fast lookup later
71   for (const DebugTensorWatch& watch : watches) {
72     if (watch.debug_ops().empty()) {
73       continue;
74     }
75 
76     if (watch.debug_urls().empty()) {
77       continue;
78     }
79 
80     if (watch.node_name() == "*") {
81       if (watch.output_slot() == -1) {
82         default_debug_ops.insert(default_debug_ops.end(),
83                                  watch.debug_ops().begin(),
84                                  watch.debug_ops().end());
85         default_debug_urls.insert(default_debug_urls.end(),
86                                   watch.debug_urls().begin(),
87                                   watch.debug_urls().end());
88       } else {
89         return Status(error::FAILED_PRECONDITION,
90                       strings::StrCat(
91                           "output_slot is expected to be -1 for wildcard ",
92                           "node name (\"*\"), but got ", watch.output_slot()));
93       }
94       continue;
95     } else {
96       if (watch.output_slot() < 0) {
97         return Status(
98             error::FAILED_PRECONDITION,
99             strings::StrCat("A negative output_slot in DebugTensorWatch is ",
100                             "valid only for the wildcard node name (\"*\"), ",
101                             "but got node name ", watch.node_name()));
102       }
103     }
104 
105     string tensor_name =
106         strings::StrCat(watch.node_name(), ":", watch.output_slot());
107 
108     std::vector<string> debug_ops;
109     for (const string& debug_op : watch.debug_ops()) {
110       debug_ops.push_back(debug_op);
111     }
112 
113     tensor_watches[tensor_name] = debug_ops;
114     tensor_tolerate_failures[tensor_name] =
115         watch.tolerate_debug_op_creation_failures();
116 
117     std::vector<string> urls;
118     for (const string& url : watch.debug_urls()) {
119       urls.push_back(url);
120     }
121     tensor_watch_urls[tensor_name] = urls;
122   }
123 
124   if (tensor_watches.empty()) {
125     return Status::OK();
126   }
127 
128   DeviceType device_type = DeviceType{device->device_type()};
129 
130   // Keep track of all edges to be removed.
131   std::vector<const Edge*> edges_to_remove;
132 
133   for (Node* src_node : graph->nodes()) {
134     // Make a map from output slot to outgoing edges from the slot.
135     std::unordered_map<int, std::vector<const Edge*>> output_slot_to_edges;
136     for (const Edge* edge : src_node->out_edges()) {
137       const int src_output = edge->src_output();
138       if (output_slot_to_edges.find(src_output) == output_slot_to_edges.end()) {
139         output_slot_to_edges[src_output] = {edge};
140       } else {
141         output_slot_to_edges[src_output].push_back(edge);
142       }
143     }
144 
145     // Iterate through all output slots of the node.
146     for (int src_output_slot = 0; src_output_slot < src_node->num_outputs();
147          ++src_output_slot) {
148       const string tensor_name =
149           strings::StrCat(src_node->name(), ":", src_output_slot);
150       const bool explicit_tensor_match =
151           tensor_watches.find(tensor_name) != tensor_watches.end();
152       if (!explicit_tensor_match && default_debug_ops.empty()) {
153         continue;
154       }
155 
156       // Now we have encountered a watched tensor. We will:
157       //   1) Mark this edge as to be removed, iff this is a non-Reference
158       //      tensor
159       //   2) Create a Copy node for the tensor
160       //   3) Add a new edge, from the source tensor to the Copy node
161       //   4) Add a new edge, from the Copy node to the destination node, iff
162       //      this is a non-Reference tensor.
163       //   5) Create all the requested debug nodes and their edges to the Copy
164       //      node.
165       //   6) Add control edges from the debug nodes to the destination nodes
166       //      to ensure that the tensors values exported by the debug nodes
167       //      to the debug URLs reflect the values before the execution of
168       //      the destination nodes.
169 
170       const DataType src_dt = src_node->output_type(src_output_slot);
171       MemoryType memory_type;
172       TF_RETURN_IF_ERROR(MemoryTypeForOutput(device_type, graph, src_node,
173                                              src_output_slot, &memory_type));
174 
175       // Create the copy node for the watched tensor.
176       const std::vector<string> debug_ops = explicit_tensor_match
177                                                 ? tensor_watches[tensor_name]
178                                                 : default_debug_ops;
179       const std::vector<string> debug_urls =
180           explicit_tensor_match ? tensor_watch_urls[tensor_name]
181                                 : default_debug_urls;
182       Node* copy_node;
183       Status copy_s =
184           CreateCopyNode(graph, device_type, memory_type == HOST_MEMORY,
185                          src_node->name(), src_output_slot, src_dt, tensor_name,
186                          debug_ops, debug_urls, &copy_node);
187       if (!copy_s.ok()) {
188         return Status(
189             error::FAILED_PRECONDITION,
190             strings::StrCat("Failed to create Copy/CopyHost node for tensor ",
191                             tensor_name, ", due to: ", copy_s.error_message()));
192       }
193 
194       // Add edge from watched tensor to the copy node.
195       graph->AddEdge(src_node, src_output_slot, copy_node, 0);
196 
197       // Create all requested debug nodes and their edges to the Copy node.
198       std::vector<Node*> debug_nodes;
199       for (size_t i = 0; i < debug_ops.size(); ++i) {
200         const string& debug_op_name = debug_ops[i];
201 
202         Node* debug_node;
203         Status debug_s = CreateDebugNode(graph, *device, copy_node->name(),
204                                          src_dt, tensor_name, debug_urls, i,
205                                          debug_op_name, &debug_node);
206         if (debug_s.ok()) {
207           graph->AddEdge(copy_node, 0, debug_node, 0);
208           debug_nodes.push_back(debug_node);
209         } else {
210           if (tensor_tolerate_failures[tensor_name]) {
211             LOG(INFO) << "Tolerating failure to create debug node: "
212                       << "tensor name = " << tensor_name << "; "
213                       << "debug op name = " << debug_op_name;
214           } else {
215             return Status(
216                 error::FAILED_PRECONDITION,
217                 strings::StrCat("Failed to create debug node ", debug_op_name,
218                                 " for tensor ", tensor_name,
219                                 ", due to: ", debug_s.error_message()));
220           }
221         }
222       }
223 
224       // Is the output a reference?
225       const bool is_ref = IsRefType(src_node->output_type(src_output_slot));
226 
227       // Iterate through all outgoing edges attached to the slot.
228       for (const Edge* edge : output_slot_to_edges[src_output_slot]) {
229         // Mark the edge for removal.
230         if (!is_ref) {
231           edges_to_remove.push_back(edge);
232           graph->AddEdge(copy_node, 0, edge->dst(), edge->dst_input());
233         }
234 
235         // Add control edges from the debug nodes to the destination node
236         // to ensure that the debug nodes are executed before the destination
237         // node. Skip Enter and NextIteration ops to avoid hanging.
238         for (Node* debug_node : debug_nodes) {
239           if (!src_node->IsEnter() && !src_node->IsNextIteration()) {
240             graph->AddEdge(debug_node, Graph::kControlSlot, edge->dst(),
241                            Graph::kControlSlot);
242           }
243         }
244       }
245     }
246   }
247 
248   // Remove all edges marked for removal.
249   for (const Edge* edge : edges_to_remove) {
250     graph->RemoveEdge(edge);
251   }
252 
253   return Status::OK();
254 }
255 
DeparallelizeWhileLoops(Graph * graph,Device * device)256 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
257   bool deparallelized_a_loop = false;
258   for (Node* node : graph->nodes()) {
259     if (node->IsEnter()) {
260       const AttrValue* parallel_iterations =
261           node->attrs().Find("parallel_iterations");
262       if (parallel_iterations && parallel_iterations->i() > 1) {
263         deparallelized_a_loop = true;
264         VLOG(1) << "Changing the parallel_iterations attribute of the "
265                 << "Enter/RefEnter node \"" << node->name() << "\" on device \""
266                 << device->name() << "\" from " << parallel_iterations->i()
267                 << " to 1.";
268         node->AddAttr<int64>("parallel_iterations", 1);
269       }
270     }
271   }
272   if (deparallelized_a_loop) {
273     LOG(INFO) << "For debugging, tfdbg has set the parallel_iterations "
274               << "attribute of all scheduled Enter/RefEnter nodes to 1. (This "
275               << "does not affect subsequent non-debug runs.)";
276   }
277 }
278 
279 // static
GetCopyNodeName(const string & node_name,const int output_slot)280 const string DebugNodeInserter::GetCopyNodeName(const string& node_name,
281                                                 const int output_slot) {
282   // For example, if the watched node is named "node1" and the output slot
283   // is 0, the debug node will be called: __copy_node1_0
284   return strings::StrCat("__copy_", node_name, "_", output_slot);
285 }
286 
287 // static
GetDebugNodeName(const string & tensor_name,const int debug_op_num,const string & debug_op_name)288 const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
289                                                  const int debug_op_num,
290                                                  const string& debug_op_name) {
291   // For example, if the watched node is named "node1" and the debug op that
292   // watches the output slot of node1 is of the type "DebugNanCount", the
293   // debug node will be called: __dbg_node1_0_0_DebugNanCount.
294   return strings::StrCat("__dbg_", tensor_name, "_", debug_op_num, "_",
295                          debug_op_name);
296 }
297 
298 // static
CreateCopyNode(Graph * graph,const DeviceType device_type,const bool is_host_memory,const string & src_node_name,const int src_output,const DataType src_dt,const string & tensor_name,const std::vector<string> & debug_ops,const std::vector<string> & debug_urls,Node ** copy_node)299 Status DebugNodeInserter::CreateCopyNode(
300     Graph* graph, const DeviceType device_type, const bool is_host_memory,
301     const string& src_node_name, const int src_output, const DataType src_dt,
302     const string& tensor_name, const std::vector<string>& debug_ops,
303     const std::vector<string>& debug_urls, Node** copy_node) {
304   const string kGatedGrpcAttributeKey = "gated_grpc";
305 
306   NodeDef node_def;
307   const KernelDef* kdef;
308 
309   const string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
310   const string copy_node_name = GetCopyNodeName(src_node_name, src_output);
311 
312   // Cross debug_ops and debug_urls to get the list of debug ops and watches.
313   std::vector<string> debug_ops_spec;
314   for (const string& debug_op : debug_ops) {
315     for (const string& debug_url : debug_urls) {
316       string debug_op_name_proper;
317       std::unordered_map<string, string> custom_attributes;
318       TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper,
319                                           &custom_attributes));
320 
321       bool gated_grpc_value = false;
322       if (custom_attributes.find(kGatedGrpcAttributeKey) !=
323           custom_attributes.end()) {
324         TF_RETURN_IF_ERROR(ParseBoolString(
325             custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value));
326       }
327       debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";",
328                                                debug_url, ";",
329                                                gated_grpc_value ? "1" : "0"));
330     }
331   }
332 
333   auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
334                      .Input(src_node_name, src_output, src_dt)
335                      .Attr("debug_ops_spec", debug_ops_spec);
336 
337   if (!builder.Finalize(&node_def).ok()) {
338     return Status(
339         error::FAILED_PRECONDITION,
340         strings::StrCat("Failed to create node definition ", "for copy op ",
341                         copy_node_name, " on watched tensor ", tensor_name));
342   }
343   Status s = FindKernelDef(device_type, node_def, &kdef, nullptr);
344 
345   if (!s.ok()) {
346     return Status(
347         error::FAILED_PRECONDITION,
348         strings::StrCat("Failed to find kernel definition ", "for copy op ",
349                         copy_node_name, " on watched tensor ", tensor_name));
350   }
351   if (!NodeBuilder(builder).Finalize(graph, copy_node).ok()) {
352     return Status(error::FAILED_PRECONDITION,
353                   strings::StrCat("Failed to create copy node ", copy_node_name,
354                                   " on watched tensor ", tensor_name));
355   }
356 
357   return Status::OK();
358 }
359 
360 // static
ParseDebugOpName(const string & debug_op_name,string * debug_op_name_proper,std::unordered_map<string,string> * attributes)361 Status DebugNodeInserter::ParseDebugOpName(
362     const string& debug_op_name, string* debug_op_name_proper,
363     std::unordered_map<string, string>* attributes) {
364   const size_t l_index = debug_op_name.find('(');
365   const size_t r_index = debug_op_name.find(')');
366   if (l_index == string::npos && r_index == string::npos) {
367     *debug_op_name_proper = debug_op_name;
368   } else {
369     if (l_index == string::npos || l_index == 0 ||
370         r_index != debug_op_name.size() - 1) {
371       return errors::InvalidArgument("Malformed debug op name \"",
372                                      debug_op_name, "\"");
373     }
374 
375     *debug_op_name_proper = debug_op_name.substr(0, l_index);
376     string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1);
377 
378     std::vector<string> attribute_segs = str_util::Split(arguments, ";");
379     for (const string& attribute_seg : attribute_segs) {
380       StringPiece seg(attribute_seg);
381       str_util::RemoveWhitespaceContext(&seg);
382       if (seg.empty()) {
383         continue;
384       }
385 
386       const size_t eq_index = seg.find('=');
387       if (eq_index == string::npos) {
388         return errors::InvalidArgument(
389             "Malformed attributes in debug op name \"", debug_op_name, "\"");
390       }
391 
392       const string key(seg.substr(0, eq_index));
393       const string value(
394           seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1));
395       if (key.empty() || value.empty()) {
396         return errors::InvalidArgument(
397             "Malformed attributes in debug op name \"", debug_op_name, "\"");
398       }
399 
400       if (attributes->find(key) == attributes->end()) {
401         (*attributes)[key] = value;
402       } else {
403         return errors::InvalidArgument("Duplicate attribute name \"", key,
404                                        "\" found in the debug op: \"",
405                                        debug_op_name, "\"");
406       }
407     }
408   }
409   return Status::OK();
410 }
411 
412 // static
SetDebugNodeAttributes(Node * debug_node,const std::unordered_map<string,string> & attributes)413 Status DebugNodeInserter::SetDebugNodeAttributes(
414     Node* debug_node, const std::unordered_map<string, string>& attributes) {
415   std::unordered_set<string> unfulfilled_keys;
416   for (const auto& item : attributes) {
417     unfulfilled_keys.insert(item.first);
418   }
419 
420   for (const auto& attr : debug_node->op_def().attr()) {
421     if (attributes.find(attr.name()) != attributes.end()) {
422       const string& attr_value = attributes.at(attr.name());
423       if (attr.type() == "string") {
424         debug_node->AddAttr<string>(attr.name(), attr_value);
425       } else if (attr.type() == "float") {
426         float float_value = 0.0;
427         if (!::tensorflow::strings::safe_strtof(attr_value.c_str(),
428                                                 &float_value)) {
429           return errors::InvalidArgument(
430               "Invalid value string for float-type attribute ", attr.name(),
431               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
432         }
433         debug_node->AddAttr<float>(attr.name(), float_value);
434       } else if (attr.type() == "int") {
435         int64 int_value = 0;
436         if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) {
437           return errors::InvalidArgument(
438               "Invalid value string for int-type attribute ", attr.name(),
439               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
440         }
441         debug_node->AddAttr<int>(attr.name(), int_value);
442       } else if (attr.type() == "bool") {
443         bool bool_value;
444         if (!ParseBoolString(attr_value, &bool_value).ok()) {
445           return errors::InvalidArgument(
446               "Invalid value string for bool-type attribute ", attr.name(),
447               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
448         }
449         debug_node->AddAttr<bool>(attr.name(), bool_value);
450       } else {
451         return errors::InvalidArgument(
452             "Unsupported type of custom attribute for debug ops: ",
453             attr.type());
454       }
455 
456       unfulfilled_keys.erase(attr.name());
457     }
458   }
459 
460   if (unfulfilled_keys.empty()) {
461     return Status::OK();
462   } else {
463     return errors::InvalidArgument(
464         unfulfilled_keys.size(),
465         " attribute key(s) were not valid for debug node ", debug_node->name(),
466         ": ", absl::StrJoin(unfulfilled_keys, ", "));
467   }
468 }
469 
470 // static
CreateDebugNode(Graph * graph,const Device & device,const string & src_copy_node_name,const DataType src_dt,const string & tensor_name,const std::vector<string> & debug_urls,const int debug_op_num,const string & debug_op_name,Node ** debug_node)471 Status DebugNodeInserter::CreateDebugNode(
472     Graph* graph, const Device& device, const string& src_copy_node_name,
473     const DataType src_dt, const string& tensor_name,
474     const std::vector<string>& debug_urls, const int debug_op_num,
475     const string& debug_op_name, Node** debug_node) {
476   NodeDef node_def;
477   const KernelDef* kdef;
478 
479   string debug_op_name_proper;
480   std::unordered_map<string, string> custom_attributes;
481   TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper,
482                                       &custom_attributes));
483 
484   const string debug_node_name =
485       GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper);
486   auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper)
487                      .Input(src_copy_node_name, 0, src_dt)
488                      .Attr("device_name", device.name())
489                      .Attr("tensor_name", tensor_name)
490                      .Attr("debug_urls", debug_urls);
491 
492   if (!builder.Finalize(&node_def).ok()) {
493     return errors::FailedPrecondition(
494         "Failed to create node definition for debug op ", debug_op_name_proper,
495         " on watched tensor ", tensor_name);
496   }
497   if (!FindKernelDef(DeviceType(device.device_type()), node_def, &kdef, nullptr)
498            .ok()) {
499     return errors::FailedPrecondition(
500         "Failed to find kernel definition for debug op ", debug_op_name_proper,
501         " on watched tensor ", tensor_name);
502   }
503   if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) {
504     return errors::FailedPrecondition("Failed to create debug node ",
505                                       debug_op_name_proper,
506                                       " on watched tensor ", tensor_name);
507   }
508 
509   // Set custom attributes (if any).
510   if (!custom_attributes.empty()) {
511     TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes));
512   }
513 
514   return Status::OK();
515 }
516 
517 }  // namespace tensorflow
518