1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/debug/debug_graph_utils.h"
17
18 #include "tensorflow/core/common_runtime/memory_types.h"
19 #include "tensorflow/core/framework/kernel_def.pb.h"
20 #include "tensorflow/core/framework/node_def_builder.h"
21 #include "tensorflow/core/framework/op_kernel.h"
22 #include "tensorflow/core/graph/node_builder.h"
23 #include "tensorflow/core/lib/strings/strcat.h"
24 #include "tensorflow/core/protobuf/debug.pb.h"
25
26 namespace tensorflow {
27
28 namespace {
29
30 // TODO(cais): Switch to safe_strtob when available.
ParseBoolString(const string & bool_str,bool * bool_val)31 Status ParseBoolString(const string& bool_str, bool* bool_val) {
32 const string lower_bool_str = str_util::Lowercase(bool_str);
33 if (lower_bool_str == "false" || lower_bool_str == "f" ||
34 lower_bool_str == "0") {
35 *bool_val = false;
36 } else if (lower_bool_str == "true" || lower_bool_str == "t" ||
37 lower_bool_str == "1") {
38 *bool_val = true;
39 } else {
40 return errors::InvalidArgument("Invalid string for bool value: ", bool_str);
41 }
42 return Status::OK();
43 }
44
45 } // namespace
46
47 // static
InsertNodes(const protobuf::RepeatedPtrField<DebugTensorWatch> & watches,Graph * graph,Device * device)48 Status DebugNodeInserter::InsertNodes(
49 const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
50 Device* device) {
51 // TODO(cais): This method is getting too large in size.
52 // Refactor it with helpers.
53
54 if (watches.empty()) {
55 // Nothing to do: Return OK right away.
56 return Status::OK();
57 }
58
59 // A map from tensor name (e.g., "node_a:0") to list of debug op names
60 // (e.g., {"DebugIdentity", "DebugNanCount"})
61 std::unordered_map<string, std::vector<string>> tensor_watches;
62 // A map from tensor name to debug_url.
63 std::unordered_map<string, std::vector<string>> tensor_watch_urls;
64 std::unordered_map<string, bool> tensor_tolerate_failures;
65
66 // Cache the proto content for fast lookup later
67 for (const DebugTensorWatch& watch : watches) {
68 if (watch.output_slot() < 0) {
69 // The semantics of output_slot == -1 is that the node is watched only
70 // for completion, but not for output tensor values (see
71 // NodeCompletionCallback in debug_gateway.h).
72 continue;
73 }
74 if (watch.debug_ops().empty()) {
75 continue;
76 }
77
78 string tensor_name =
79 strings::StrCat(watch.node_name(), ":", watch.output_slot());
80
81 std::vector<string> debug_ops;
82 for (const string& debug_op : watch.debug_ops()) {
83 debug_ops.push_back(debug_op);
84 }
85
86 tensor_watches[tensor_name] = debug_ops;
87 tensor_tolerate_failures[tensor_name] =
88 watch.tolerate_debug_op_creation_failures();
89
90 std::vector<string> urls;
91 for (const string& url : watch.debug_urls()) {
92 urls.push_back(url);
93 }
94 tensor_watch_urls[tensor_name] = urls;
95 }
96
97 if (tensor_watches.empty()) {
98 return Status::OK();
99 }
100
101 DeviceType device_type = DeviceType{device->device_type()};
102
103 // Keep track of all edges to be removed.
104 std::vector<const Edge*> edges_to_remove;
105
106 for (Node* src_node : graph->nodes()) {
107 // Make a map from output slot to outgoing edges from the slot.
108 std::unordered_map<int, std::vector<const Edge*>> output_slot_to_edges;
109 for (const Edge* edge : src_node->out_edges()) {
110 const int src_output = edge->src_output();
111 if (output_slot_to_edges.find(src_output) == output_slot_to_edges.end()) {
112 output_slot_to_edges[src_output] = {edge};
113 } else {
114 output_slot_to_edges[src_output].push_back(edge);
115 }
116 }
117
118 // Iterate through all output slots of the node.
119 for (int src_output_slot = 0; src_output_slot < src_node->num_outputs();
120 ++src_output_slot) {
121 const string tensor_name =
122 strings::StrCat(src_node->name(), ":", src_output_slot);
123 if (tensor_watches.find(tensor_name) == tensor_watches.end()) {
124 // Add debug nodes only for edges with matching source node and source
125 // output slot.
126 continue;
127 }
128
129 // Now we have encountered a watched tensor. We will:
130 // 1) Mark this edge as to be removed, iff this is a non-Reference
131 // tensor
132 // 2) Create a Copy node for the tensor
133 // 3) Add a new edge, from the source tensor to the Copy node
134 // 4) Add a new edge, from the Copy node to the destination node, iff
135 // this is a non-Reference tensor.
136 // 5) Create all the requested debug nodes and their edges to the Copy
137 // node.
138 // 6) Add control edges from the debug nodes to the destination nodes
139 // to ensure that the tensors values exported by the debug nodes
140 // to the debug URLs reflect the values before the execution of
141 // the destination nodes.
142
143 const DataType src_dt = src_node->output_type(src_output_slot);
144 MemoryType memory_type;
145 TF_RETURN_IF_ERROR(MemoryTypeForOutput(device_type, graph, src_node,
146 src_output_slot, &memory_type));
147
148 // Create the copy node for the watched tensor.
149 Node* copy_node;
150 Status copy_s = CreateCopyNode(
151 graph, device_type, memory_type == HOST_MEMORY, src_node->name(),
152 src_output_slot, src_dt, tensor_name, tensor_watches[tensor_name],
153 tensor_watch_urls[tensor_name], ©_node);
154 if (!copy_s.ok()) {
155 return Status(
156 error::FAILED_PRECONDITION,
157 strings::StrCat("Failed to create Copy/CopyHost node for tensor ",
158 tensor_name, ", due to: ", copy_s.error_message()));
159 }
160
161 // Add edge from watched tensor to the copy node.
162 graph->AddEdge(src_node, src_output_slot, copy_node, 0);
163
164 // Create all requested debug nodes and their edges to the Copy node.
165 std::vector<Node*> debug_nodes;
166 for (size_t i = 0; i < tensor_watches[tensor_name].size(); ++i) {
167 const string& debug_op_name = tensor_watches[tensor_name][i];
168
169 Node* debug_node;
170 Status debug_s = CreateDebugNode(
171 graph, *device, copy_node->name(), src_dt, tensor_name,
172 tensor_watch_urls[tensor_name], i, debug_op_name, &debug_node);
173 if (debug_s.ok()) {
174 graph->AddEdge(copy_node, 0, debug_node, 0);
175 debug_nodes.push_back(debug_node);
176 } else {
177 if (tensor_tolerate_failures[tensor_name]) {
178 LOG(INFO) << "Tolerating failure to create debug node: "
179 << "tensor name = " << tensor_name << "; "
180 << "debug op name = " << debug_op_name;
181 } else {
182 return Status(
183 error::FAILED_PRECONDITION,
184 strings::StrCat("Failed to create debug node ", debug_op_name,
185 " for tensor ", tensor_name,
186 ", due to: ", debug_s.error_message()));
187 }
188 }
189 }
190
191 // Is the output a reference?
192 const bool is_ref = IsRefType(src_node->output_type(src_output_slot));
193
194 // Iterate through all outgoing edges attached to the slot.
195 for (const Edge* edge : output_slot_to_edges[src_output_slot]) {
196 // Mark the edge for removal.
197 if (!is_ref) {
198 edges_to_remove.push_back(edge);
199 graph->AddEdge(copy_node, 0, edge->dst(), edge->dst_input());
200 }
201
202 // Add control edges from the debug nodes to the destination node
203 // to ensure that the debug nodes are executed before the destination
204 // node. Skip Enter and NextIteration ops to avoid hanging.
205 for (Node* debug_node : debug_nodes) {
206 if (!src_node->IsEnter() && !src_node->IsNextIteration()) {
207 graph->AddEdge(debug_node, Graph::kControlSlot, edge->dst(),
208 Graph::kControlSlot);
209 }
210 }
211 }
212 }
213 }
214
215 // Remove all edges marked for removal.
216 for (const Edge* edge : edges_to_remove) {
217 graph->RemoveEdge(edge);
218 }
219
220 return Status::OK();
221 }
222
DeparallelizeWhileLoops(Graph * graph,Device * device)223 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
224 bool deparallelized_a_loop = false;
225 for (Node* node : graph->nodes()) {
226 if (node->IsEnter()) {
227 const AttrValue* parallel_iterations =
228 node->attrs().Find("parallel_iterations");
229 if (parallel_iterations && parallel_iterations->i() > 1) {
230 deparallelized_a_loop = true;
231 VLOG(1) << "Changing the parallel_iterations attribute of the "
232 << "Enter/RefEnter node \"" << node->name() << "\" on device \""
233 << device->name() << "\" from " << parallel_iterations->i()
234 << " to 1.";
235 node->AddAttr<int64>("parallel_iterations", 1);
236 }
237 }
238 }
239 if (deparallelized_a_loop) {
240 LOG(INFO) << "For debugging, tfdbg has set the parallel_iterations "
241 << "attribute of all scheduled Enter/RefEnter nodes to 1. (This "
242 << "does not affect subsequent non-debug runs.)";
243 }
244 }
245
246 // static
GetCopyNodeName(const string & node_name,const int output_slot)247 const string DebugNodeInserter::GetCopyNodeName(const string& node_name,
248 const int output_slot) {
249 // For example, if the watched node is named "node1" and the output slot
250 // is 0, the debug node will be called: __copy_node1_0
251 return strings::StrCat("__copy_", node_name, "_", output_slot);
252 }
253
254 // static
GetDebugNodeName(const string & tensor_name,const int debug_op_num,const string & debug_op_name)255 const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
256 const int debug_op_num,
257 const string& debug_op_name) {
258 // For example, if the watched node is named "node1" and the debug op that
259 // watches the output slot of node1 is of the type "DebugNanCount", the
260 // debug node will be called: __dbg_node1_0_0_DebugNanCount.
261 return strings::StrCat("__dbg_", tensor_name, "_", debug_op_num, "_",
262 debug_op_name);
263 }
264
265 // static
CreateCopyNode(Graph * graph,const DeviceType device_type,const bool is_host_memory,const string & src_node_name,const int src_output,const DataType src_dt,const string & tensor_name,const std::vector<string> & debug_ops,const std::vector<string> & debug_urls,Node ** copy_node)266 Status DebugNodeInserter::CreateCopyNode(
267 Graph* graph, const DeviceType device_type, const bool is_host_memory,
268 const string& src_node_name, const int src_output, const DataType src_dt,
269 const string& tensor_name, const std::vector<string>& debug_ops,
270 const std::vector<string>& debug_urls, Node** copy_node) {
271 const string kGatedGrpcAttributeKey = "gated_grpc";
272
273 NodeDef node_def;
274 const KernelDef* kdef;
275
276 const string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
277 const string copy_node_name = GetCopyNodeName(src_node_name, src_output);
278
279 // Cross debug_ops and debug_urls to get the list of debug ops and watches.
280 std::vector<string> debug_ops_spec;
281 for (const string& debug_op : debug_ops) {
282 for (const string& debug_url : debug_urls) {
283 string debug_op_name_proper;
284 std::unordered_map<string, string> custom_attributes;
285 TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper,
286 &custom_attributes));
287
288 bool gated_grpc_value = false;
289 if (custom_attributes.find(kGatedGrpcAttributeKey) !=
290 custom_attributes.end()) {
291 TF_RETURN_IF_ERROR(ParseBoolString(
292 custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value));
293 }
294 debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";",
295 debug_url, ";",
296 gated_grpc_value ? "1" : "0"));
297 }
298 }
299
300 auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
301 .Input(src_node_name, src_output, src_dt)
302 .Attr("debug_ops_spec", debug_ops_spec);
303
304 if (!builder.Finalize(&node_def).ok()) {
305 return Status(
306 error::FAILED_PRECONDITION,
307 strings::StrCat("Failed to create node definition ", "for copy op ",
308 copy_node_name, " on watched tensor ", tensor_name));
309 }
310 Status s = FindKernelDef(device_type, node_def, &kdef, nullptr);
311
312 if (!s.ok()) {
313 return Status(
314 error::FAILED_PRECONDITION,
315 strings::StrCat("Failed to find kernel definition ", "for copy op ",
316 copy_node_name, " on watched tensor ", tensor_name));
317 }
318 if (!NodeBuilder(builder).Finalize(graph, copy_node).ok()) {
319 return Status(error::FAILED_PRECONDITION,
320 strings::StrCat("Failed to create copy node ", copy_node_name,
321 " on watched tensor ", tensor_name));
322 }
323
324 return Status::OK();
325 }
326
327 // static
ParseDebugOpName(const string & debug_op_name,string * debug_op_name_proper,std::unordered_map<string,string> * attributes)328 Status DebugNodeInserter::ParseDebugOpName(
329 const string& debug_op_name, string* debug_op_name_proper,
330 std::unordered_map<string, string>* attributes) {
331 const size_t l_index = debug_op_name.find('(');
332 const size_t r_index = debug_op_name.find(')');
333 if (l_index == string::npos && r_index == string::npos) {
334 *debug_op_name_proper = debug_op_name;
335 } else {
336 if (l_index == string::npos || l_index == 0 ||
337 r_index != debug_op_name.size() - 1) {
338 return errors::InvalidArgument("Malformed debug op name \"",
339 debug_op_name, "\"");
340 }
341
342 *debug_op_name_proper = debug_op_name.substr(0, l_index);
343 string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1);
344
345 std::vector<string> attribute_segs = str_util::Split(arguments, ";");
346 for (const string& attribute_seg : attribute_segs) {
347 StringPiece seg(attribute_seg);
348 str_util::RemoveWhitespaceContext(&seg);
349 if (seg.empty()) {
350 continue;
351 }
352
353 const size_t eq_index = seg.find('=');
354 if (eq_index == string::npos) {
355 return errors::InvalidArgument(
356 "Malformed attributes in debug op name \"", debug_op_name, "\"");
357 }
358
359 const string key(seg.substr(0, eq_index));
360 const string value(
361 seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1));
362 if (key.empty() || value.empty()) {
363 return errors::InvalidArgument(
364 "Malformed attributes in debug op name \"", debug_op_name, "\"");
365 }
366
367 if (attributes->find(key) == attributes->end()) {
368 (*attributes)[key] = value;
369 } else {
370 return errors::InvalidArgument("Duplicate attribute name \"", key,
371 "\" found in the debug op: \"",
372 debug_op_name, "\"");
373 }
374 }
375 }
376 return Status::OK();
377 }
378
379 // static
SetDebugNodeAttributes(Node * debug_node,const std::unordered_map<string,string> & attributes)380 Status DebugNodeInserter::SetDebugNodeAttributes(
381 Node* debug_node, const std::unordered_map<string, string>& attributes) {
382 std::unordered_set<string> unfulfilled_keys;
383 for (const auto& item : attributes) {
384 unfulfilled_keys.insert(item.first);
385 }
386
387 for (const auto& attr : debug_node->op_def().attr()) {
388 if (attributes.find(attr.name()) != attributes.end()) {
389 const string& attr_value = attributes.at(attr.name());
390 if (attr.type() == "string") {
391 debug_node->AddAttr<string>(attr.name(), attr_value);
392 } else if (attr.type() == "float") {
393 float float_value = 0.0;
394 if (!::tensorflow::strings::safe_strtof(attr_value.c_str(),
395 &float_value)) {
396 return errors::InvalidArgument(
397 "Invalid value string for float-type attribute ", attr.name(),
398 "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
399 }
400 debug_node->AddAttr<float>(attr.name(), float_value);
401 } else if (attr.type() == "int") {
402 int64 int_value = 0;
403 if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) {
404 return errors::InvalidArgument(
405 "Invalid value string for int-type attribute ", attr.name(),
406 "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
407 }
408 debug_node->AddAttr<int>(attr.name(), int_value);
409 } else if (attr.type() == "bool") {
410 bool bool_value;
411 if (!ParseBoolString(attr_value, &bool_value).ok()) {
412 return errors::InvalidArgument(
413 "Invalid value string for bool-type attribute ", attr.name(),
414 "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
415 }
416 debug_node->AddAttr<bool>(attr.name(), bool_value);
417 } else {
418 return errors::InvalidArgument(
419 "Unsupported type of custom attribute for debug ops: ",
420 attr.type());
421 }
422
423 unfulfilled_keys.erase(attr.name());
424 }
425 }
426
427 if (unfulfilled_keys.empty()) {
428 return Status::OK();
429 } else {
430 return errors::InvalidArgument(
431 unfulfilled_keys.size(),
432 " attribute key(s) were not valid for debug node ", debug_node->name(),
433 ": ", str_util::Join(unfulfilled_keys, ", "));
434 }
435 }
436
437 // static
CreateDebugNode(Graph * graph,const Device & device,const string & src_copy_node_name,const DataType src_dt,const string & tensor_name,const std::vector<string> & debug_urls,const int debug_op_num,const string & debug_op_name,Node ** debug_node)438 Status DebugNodeInserter::CreateDebugNode(
439 Graph* graph, const Device& device, const string& src_copy_node_name,
440 const DataType src_dt, const string& tensor_name,
441 const std::vector<string>& debug_urls, const int debug_op_num,
442 const string& debug_op_name, Node** debug_node) {
443 NodeDef node_def;
444 const KernelDef* kdef;
445
446 string debug_op_name_proper;
447 std::unordered_map<string, string> custom_attributes;
448 TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper,
449 &custom_attributes));
450
451 const string debug_node_name =
452 GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper);
453 auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper)
454 .Input(src_copy_node_name, 0, src_dt)
455 .Attr("device_name", device.name())
456 .Attr("tensor_name", tensor_name)
457 .Attr("debug_urls", debug_urls);
458
459 if (!builder.Finalize(&node_def).ok()) {
460 return errors::FailedPrecondition(
461 "Failed to create node definition for debug op ", debug_op_name_proper,
462 " on watched tensor ", tensor_name);
463 }
464 if (!FindKernelDef(DeviceType(device.device_type()), node_def, &kdef, nullptr)
465 .ok()) {
466 return errors::FailedPrecondition(
467 "Failed to find kernel definition for debug op ", debug_op_name_proper,
468 " on watched tensor ", tensor_name);
469 }
470 if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) {
471 return errors::FailedPrecondition("Failed to create debug node ",
472 debug_op_name_proper,
473 " on watched tensor ", tensor_name);
474 }
475
476 // Set custom attributes (if any).
477 if (!custom_attributes.empty()) {
478 TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes));
479 }
480
481 return Status::OK();
482 }
483
484 } // namespace tensorflow
485