1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/debug/debug_graph_utils.h"
17 
18 #include "tensorflow/core/common_runtime/memory_types.h"
19 #include "tensorflow/core/framework/kernel_def.pb.h"
20 #include "tensorflow/core/framework/node_def_builder.h"
21 #include "tensorflow/core/framework/op_kernel.h"
22 #include "tensorflow/core/graph/node_builder.h"
23 #include "tensorflow/core/lib/strings/strcat.h"
24 #include "tensorflow/core/protobuf/debug.pb.h"
25 
26 namespace tensorflow {
27 
28 namespace {
29 
30 // TODO(cais): Switch to safe_strtob when available.
ParseBoolString(const string & bool_str,bool * bool_val)31 Status ParseBoolString(const string& bool_str, bool* bool_val) {
32   const string lower_bool_str = str_util::Lowercase(bool_str);
33   if (lower_bool_str == "false" || lower_bool_str == "f" ||
34       lower_bool_str == "0") {
35     *bool_val = false;
36   } else if (lower_bool_str == "true" || lower_bool_str == "t" ||
37              lower_bool_str == "1") {
38     *bool_val = true;
39   } else {
40     return errors::InvalidArgument("Invalid string for bool value: ", bool_str);
41   }
42   return Status::OK();
43 }
44 
45 }  // namespace
46 
47 // static
Status DebugNodeInserter::InsertNodes(
    const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
    Device* device) {
  // TODO(cais): This method is getting too large in size.
  // Refactor it with helpers.

  if (watches.empty()) {
    // Nothing to do: Return OK right away.
    return Status::OK();
  }

  // A map from tensor name (e.g., "node_a:0") to list of debug op names
  // (e.g., {"DebugIdentity", "DebugNanCount"})
  std::unordered_map<string, std::vector<string>> tensor_watches;
  // A map from tensor name to debug_url.
  std::unordered_map<string, std::vector<string>> tensor_watch_urls;
  // A map from tensor name to whether failures to create any of its debug
  // nodes should be tolerated (logged) rather than aborting InsertNodes.
  std::unordered_map<string, bool> tensor_tolerate_failures;

  // Cache the proto content for fast lookup later
  for (const DebugTensorWatch& watch : watches) {
    if (watch.output_slot() < 0) {
      // The semantics of output_slot == -1 is that the node is watched only
      // for completion, but not for output tensor values (see
      // NodeCompletionCallback in debug_gateway.h).
      continue;
    }
    if (watch.debug_ops().empty()) {
      // A watch with no debug ops attached would create no debug nodes;
      // skip it entirely.
      continue;
    }

    // Key format is "<node_name>:<output_slot>", matching the tensor-name
    // strings built per output slot in the graph walk below.
    string tensor_name =
        strings::StrCat(watch.node_name(), ":", watch.output_slot());

    std::vector<string> debug_ops;
    for (const string& debug_op : watch.debug_ops()) {
      debug_ops.push_back(debug_op);
    }

    tensor_watches[tensor_name] = debug_ops;
    tensor_tolerate_failures[tensor_name] =
        watch.tolerate_debug_op_creation_failures();

    std::vector<string> urls;
    for (const string& url : watch.debug_urls()) {
      urls.push_back(url);
    }
    tensor_watch_urls[tensor_name] = urls;
  }

  if (tensor_watches.empty()) {
    // All watches were completion-only or had no debug ops.
    return Status::OK();
  }

  DeviceType device_type = DeviceType{device->device_type()};

  // Keep track of all edges to be removed. Removal is deferred until after
  // the graph walk so that the edge lists being iterated are not mutated
  // mid-iteration.
  std::vector<const Edge*> edges_to_remove;

  for (Node* src_node : graph->nodes()) {
    // Make a map from output slot to outgoing edges from the slot.
    std::unordered_map<int, std::vector<const Edge*>> output_slot_to_edges;
    for (const Edge* edge : src_node->out_edges()) {
      const int src_output = edge->src_output();
      if (output_slot_to_edges.find(src_output) == output_slot_to_edges.end()) {
        output_slot_to_edges[src_output] = {edge};
      } else {
        output_slot_to_edges[src_output].push_back(edge);
      }
    }

    // Iterate through all output slots of the node.
    for (int src_output_slot = 0; src_output_slot < src_node->num_outputs();
         ++src_output_slot) {
      const string tensor_name =
          strings::StrCat(src_node->name(), ":", src_output_slot);
      if (tensor_watches.find(tensor_name) == tensor_watches.end()) {
        // Add debug nodes only for edges with matching source node and source
        // output slot.
        continue;
      }

      // Now we have encountered a watched tensor. We will:
      //   1) Mark this edge as to be removed, iff this is a non-Reference
      //      tensor
      //   2) Create a Copy node for the tensor
      //   3) Add a new edge, from the source tensor to the Copy node
      //   4) Add a new edge, from the Copy node to the destination node, iff
      //      this is a non-Reference tensor.
      //   5) Create all the requested debug nodes and their edges to the Copy
      //      node.
      //   6) Add control edges from the debug nodes to the destination nodes
      //      to ensure that the tensors values exported by the debug nodes
      //      to the debug URLs reflect the values before the execution of
      //      the destination nodes.

      const DataType src_dt = src_node->output_type(src_output_slot);
      // Whether the slot lives in host memory decides Copy vs CopyHost below.
      MemoryType memory_type;
      TF_RETURN_IF_ERROR(MemoryTypeForOutput(device_type, graph, src_node,
                                             src_output_slot, &memory_type));

      // Create the copy node for the watched tensor.
      Node* copy_node;
      Status copy_s = CreateCopyNode(
          graph, device_type, memory_type == HOST_MEMORY, src_node->name(),
          src_output_slot, src_dt, tensor_name, tensor_watches[tensor_name],
          tensor_watch_urls[tensor_name], &copy_node);
      if (!copy_s.ok()) {
        // Copy-node creation failure is never tolerated (unlike debug-node
        // creation failure below), because all debug nodes hang off the copy.
        return Status(
            error::FAILED_PRECONDITION,
            strings::StrCat("Failed to create Copy/CopyHost node for tensor ",
                            tensor_name, ", due to: ", copy_s.error_message()));
      }

      // Add edge from watched tensor to the copy node.
      graph->AddEdge(src_node, src_output_slot, copy_node, 0);

      // Create all requested debug nodes and their edges to the Copy node.
      std::vector<Node*> debug_nodes;
      for (size_t i = 0; i < tensor_watches[tensor_name].size(); ++i) {
        const string& debug_op_name = tensor_watches[tensor_name][i];

        Node* debug_node;
        Status debug_s = CreateDebugNode(
            graph, *device, copy_node->name(), src_dt, tensor_name,
            tensor_watch_urls[tensor_name], i, debug_op_name, &debug_node);
        if (debug_s.ok()) {
          graph->AddEdge(copy_node, 0, debug_node, 0);
          debug_nodes.push_back(debug_node);
        } else {
          if (tensor_tolerate_failures[tensor_name]) {
            // Per-watch opt-in: log and continue with the remaining debug ops.
            LOG(INFO) << "Tolerating failure to create debug node: "
                      << "tensor name = " << tensor_name << "; "
                      << "debug op name = " << debug_op_name;
          } else {
            return Status(
                error::FAILED_PRECONDITION,
                strings::StrCat("Failed to create debug node ", debug_op_name,
                                " for tensor ", tensor_name,
                                ", due to: ", debug_s.error_message()));
          }
        }
      }

      // Is the output a reference?
      const bool is_ref = IsRefType(src_node->output_type(src_output_slot));

      // Iterate through all outgoing edges attached to the slot.
      // For non-ref tensors, each original edge is rerouted through the copy
      // node; for ref tensors the original edge is kept intact (rerouting a
      // reference through a copy would break in-place update semantics).
      for (const Edge* edge : output_slot_to_edges[src_output_slot]) {
        // Mark the edge for removal.
        if (!is_ref) {
          edges_to_remove.push_back(edge);
          graph->AddEdge(copy_node, 0, edge->dst(), edge->dst_input());
        }

        // Add control edges from the debug nodes to the destination node
        // to ensure that the debug nodes are executed before the destination
        // node. Skip Enter and NextIteration ops to avoid hanging.
        for (Node* debug_node : debug_nodes) {
          if (!src_node->IsEnter() && !src_node->IsNextIteration()) {
            graph->AddEdge(debug_node, Graph::kControlSlot, edge->dst(),
                           Graph::kControlSlot);
          }
        }
      }
    }
  }

  // Remove all edges marked for removal.
  for (const Edge* edge : edges_to_remove) {
    graph->RemoveEdge(edge);
  }

  return Status::OK();
}
222 
DeparallelizeWhileLoops(Graph * graph,Device * device)223 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
224   bool deparallelized_a_loop = false;
225   for (Node* node : graph->nodes()) {
226     if (node->IsEnter()) {
227       const AttrValue* parallel_iterations =
228           node->attrs().Find("parallel_iterations");
229       if (parallel_iterations && parallel_iterations->i() > 1) {
230         deparallelized_a_loop = true;
231         VLOG(1) << "Changing the parallel_iterations attribute of the "
232                 << "Enter/RefEnter node \"" << node->name() << "\" on device \""
233                 << device->name() << "\" from " << parallel_iterations->i()
234                 << " to 1.";
235         node->AddAttr<int64>("parallel_iterations", 1);
236       }
237     }
238   }
239   if (deparallelized_a_loop) {
240     LOG(INFO) << "For debugging, tfdbg has set the parallel_iterations "
241               << "attribute of all scheduled Enter/RefEnter nodes to 1. (This "
242               << "does not affect subsequent non-debug runs.)";
243   }
244 }
245 
246 // static
const string DebugNodeInserter::GetCopyNodeName(const string& node_name,
                                                const int output_slot) {
  // The copy node watching output slot S of node N is named "__copy_N_S",
  // e.g., slot 0 of "node1" yields "__copy_node1_0".
  return strings::StrCat("__copy_", node_name, "_", output_slot);
}
253 
254 // static
const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
                                                 const int debug_op_num,
                                                 const string& debug_op_name) {
  // The debug node watching tensor T with the debug_op_num-th debug op D is
  // named "__dbg_T_<debug_op_num>_D", e.g., watching "node1:0" with a
  // "DebugNanCount" op yields "__dbg_node1_0_0_DebugNanCount".
  return strings::StrCat("__dbg_", tensor_name, "_", debug_op_num, "_",
                         debug_op_name);
}
264 
265 // static
CreateCopyNode(Graph * graph,const DeviceType device_type,const bool is_host_memory,const string & src_node_name,const int src_output,const DataType src_dt,const string & tensor_name,const std::vector<string> & debug_ops,const std::vector<string> & debug_urls,Node ** copy_node)266 Status DebugNodeInserter::CreateCopyNode(
267     Graph* graph, const DeviceType device_type, const bool is_host_memory,
268     const string& src_node_name, const int src_output, const DataType src_dt,
269     const string& tensor_name, const std::vector<string>& debug_ops,
270     const std::vector<string>& debug_urls, Node** copy_node) {
271   const string kGatedGrpcAttributeKey = "gated_grpc";
272 
273   NodeDef node_def;
274   const KernelDef* kdef;
275 
276   const string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
277   const string copy_node_name = GetCopyNodeName(src_node_name, src_output);
278 
279   // Cross debug_ops and debug_urls to get the list of debug ops and watches.
280   std::vector<string> debug_ops_spec;
281   for (const string& debug_op : debug_ops) {
282     for (const string& debug_url : debug_urls) {
283       string debug_op_name_proper;
284       std::unordered_map<string, string> custom_attributes;
285       TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper,
286                                           &custom_attributes));
287 
288       bool gated_grpc_value = false;
289       if (custom_attributes.find(kGatedGrpcAttributeKey) !=
290           custom_attributes.end()) {
291         TF_RETURN_IF_ERROR(ParseBoolString(
292             custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value));
293       }
294       debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";",
295                                                debug_url, ";",
296                                                gated_grpc_value ? "1" : "0"));
297     }
298   }
299 
300   auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
301                      .Input(src_node_name, src_output, src_dt)
302                      .Attr("debug_ops_spec", debug_ops_spec);
303 
304   if (!builder.Finalize(&node_def).ok()) {
305     return Status(
306         error::FAILED_PRECONDITION,
307         strings::StrCat("Failed to create node definition ", "for copy op ",
308                         copy_node_name, " on watched tensor ", tensor_name));
309   }
310   Status s = FindKernelDef(device_type, node_def, &kdef, nullptr);
311 
312   if (!s.ok()) {
313     return Status(
314         error::FAILED_PRECONDITION,
315         strings::StrCat("Failed to find kernel definition ", "for copy op ",
316                         copy_node_name, " on watched tensor ", tensor_name));
317   }
318   if (!NodeBuilder(builder).Finalize(graph, copy_node).ok()) {
319     return Status(error::FAILED_PRECONDITION,
320                   strings::StrCat("Failed to create copy node ", copy_node_name,
321                                   " on watched tensor ", tensor_name));
322   }
323 
324   return Status::OK();
325 }
326 
327 // static
ParseDebugOpName(const string & debug_op_name,string * debug_op_name_proper,std::unordered_map<string,string> * attributes)328 Status DebugNodeInserter::ParseDebugOpName(
329     const string& debug_op_name, string* debug_op_name_proper,
330     std::unordered_map<string, string>* attributes) {
331   const size_t l_index = debug_op_name.find('(');
332   const size_t r_index = debug_op_name.find(')');
333   if (l_index == string::npos && r_index == string::npos) {
334     *debug_op_name_proper = debug_op_name;
335   } else {
336     if (l_index == string::npos || l_index == 0 ||
337         r_index != debug_op_name.size() - 1) {
338       return errors::InvalidArgument("Malformed debug op name \"",
339                                      debug_op_name, "\"");
340     }
341 
342     *debug_op_name_proper = debug_op_name.substr(0, l_index);
343     string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1);
344 
345     std::vector<string> attribute_segs = str_util::Split(arguments, ";");
346     for (const string& attribute_seg : attribute_segs) {
347       StringPiece seg(attribute_seg);
348       str_util::RemoveWhitespaceContext(&seg);
349       if (seg.empty()) {
350         continue;
351       }
352 
353       const size_t eq_index = seg.find('=');
354       if (eq_index == string::npos) {
355         return errors::InvalidArgument(
356             "Malformed attributes in debug op name \"", debug_op_name, "\"");
357       }
358 
359       const string key(seg.substr(0, eq_index));
360       const string value(
361           seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1));
362       if (key.empty() || value.empty()) {
363         return errors::InvalidArgument(
364             "Malformed attributes in debug op name \"", debug_op_name, "\"");
365       }
366 
367       if (attributes->find(key) == attributes->end()) {
368         (*attributes)[key] = value;
369       } else {
370         return errors::InvalidArgument("Duplicate attribute name \"", key,
371                                        "\" found in the debug op: \"",
372                                        debug_op_name, "\"");
373       }
374     }
375   }
376   return Status::OK();
377 }
378 
379 // static
SetDebugNodeAttributes(Node * debug_node,const std::unordered_map<string,string> & attributes)380 Status DebugNodeInserter::SetDebugNodeAttributes(
381     Node* debug_node, const std::unordered_map<string, string>& attributes) {
382   std::unordered_set<string> unfulfilled_keys;
383   for (const auto& item : attributes) {
384     unfulfilled_keys.insert(item.first);
385   }
386 
387   for (const auto& attr : debug_node->op_def().attr()) {
388     if (attributes.find(attr.name()) != attributes.end()) {
389       const string& attr_value = attributes.at(attr.name());
390       if (attr.type() == "string") {
391         debug_node->AddAttr<string>(attr.name(), attr_value);
392       } else if (attr.type() == "float") {
393         float float_value = 0.0;
394         if (!::tensorflow::strings::safe_strtof(attr_value.c_str(),
395                                                 &float_value)) {
396           return errors::InvalidArgument(
397               "Invalid value string for float-type attribute ", attr.name(),
398               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
399         }
400         debug_node->AddAttr<float>(attr.name(), float_value);
401       } else if (attr.type() == "int") {
402         int64 int_value = 0;
403         if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) {
404           return errors::InvalidArgument(
405               "Invalid value string for int-type attribute ", attr.name(),
406               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
407         }
408         debug_node->AddAttr<int>(attr.name(), int_value);
409       } else if (attr.type() == "bool") {
410         bool bool_value;
411         if (!ParseBoolString(attr_value, &bool_value).ok()) {
412           return errors::InvalidArgument(
413               "Invalid value string for bool-type attribute ", attr.name(),
414               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
415         }
416         debug_node->AddAttr<bool>(attr.name(), bool_value);
417       } else {
418         return errors::InvalidArgument(
419             "Unsupported type of custom attribute for debug ops: ",
420             attr.type());
421       }
422 
423       unfulfilled_keys.erase(attr.name());
424     }
425   }
426 
427   if (unfulfilled_keys.empty()) {
428     return Status::OK();
429   } else {
430     return errors::InvalidArgument(
431         unfulfilled_keys.size(),
432         " attribute key(s) were not valid for debug node ", debug_node->name(),
433         ": ", str_util::Join(unfulfilled_keys, ", "));
434   }
435 }
436 
437 // static
CreateDebugNode(Graph * graph,const Device & device,const string & src_copy_node_name,const DataType src_dt,const string & tensor_name,const std::vector<string> & debug_urls,const int debug_op_num,const string & debug_op_name,Node ** debug_node)438 Status DebugNodeInserter::CreateDebugNode(
439     Graph* graph, const Device& device, const string& src_copy_node_name,
440     const DataType src_dt, const string& tensor_name,
441     const std::vector<string>& debug_urls, const int debug_op_num,
442     const string& debug_op_name, Node** debug_node) {
443   NodeDef node_def;
444   const KernelDef* kdef;
445 
446   string debug_op_name_proper;
447   std::unordered_map<string, string> custom_attributes;
448   TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper,
449                                       &custom_attributes));
450 
451   const string debug_node_name =
452       GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper);
453   auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper)
454                      .Input(src_copy_node_name, 0, src_dt)
455                      .Attr("device_name", device.name())
456                      .Attr("tensor_name", tensor_name)
457                      .Attr("debug_urls", debug_urls);
458 
459   if (!builder.Finalize(&node_def).ok()) {
460     return errors::FailedPrecondition(
461         "Failed to create node definition for debug op ", debug_op_name_proper,
462         " on watched tensor ", tensor_name);
463   }
464   if (!FindKernelDef(DeviceType(device.device_type()), node_def, &kdef, nullptr)
465            .ok()) {
466     return errors::FailedPrecondition(
467         "Failed to find kernel definition for debug op ", debug_op_name_proper,
468         " on watched tensor ", tensor_name);
469   }
470   if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) {
471     return errors::FailedPrecondition("Failed to create debug node ",
472                                       debug_op_name_proper,
473                                       " on watched tensor ", tensor_name);
474   }
475 
476   // Set custom attributes (if any).
477   if (!custom_attributes.empty()) {
478     TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes));
479   }
480 
481   return Status::OK();
482 }
483 
484 }  // namespace tensorflow
485