1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
17 #define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
18 
19 #if GOOGLE_CUDA
20 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
21 #endif
22 #ifdef TENSORFLOW_USE_SYCL
23 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
24 #endif  // TENSORFLOW_USE_SYCL
25 #include "tensorflow/core/debug/debug_io_utils.h"
26 #include "tensorflow/core/framework/device_base.h"
27 #include "tensorflow/core/framework/op_kernel.h"
28 #include "tensorflow/core/framework/tensor_util.h"
29 #include "tensorflow/core/lib/core/notification.h"
30 #include "tensorflow/core/lib/strings/stringprintf.h"
31 
32 namespace tensorflow {
33 
34 // Copy op for debugging.
35 // Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the
36 // device on which the tensor is allocated.
37 class CopyOp : public OpKernel {
38  public:
CopyOp(OpKernelConstruction * context)39   explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
40     OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));
41 
42     std::vector<string> debug_ops_spec;
43     OP_REQUIRES_OK(context,
44                    context->GetAttr("debug_ops_spec", &debug_ops_spec));
45     for (const string& debug_op_spec : debug_ops_spec) {
46       // Assume debug_op_spec has the format
47       // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
48       // DebugIdentity;grpc://localhost:3333;1
49       const std::vector<string> items = str_util::Split(debug_op_spec, ";");
50       OP_REQUIRES(
51           context, items.size() == 3,
52           errors::Internal(
53               "Unexpected number of semicolons in debug_ops_spec element: ",
54               debug_op_spec));
55       debug_op_and_url_specs_.push_back(
56           DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
57                                items[1], items[2] == "1"));
58     }
59   }
60 
Compute(OpKernelContext * context)61   void Compute(OpKernelContext* context) override {
62     const Tensor& src_tensor = context->input(0);
63 
64     if (src_tensor.IsInitialized() &&
65         DataTypeCanUseMemcpy(src_tensor.dtype()) &&
66         DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
67       // Source tensor is initialized and is mem-copyable. Make a copy.
68       Tensor* copied_tensor;
69       OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
70                                                        &copied_tensor));
71 
72 #if GOOGLE_CUDA
73       Device* device = static_cast<Device*>(context->device());
74       // Determine if the input tensor is not on CPU (e.g., on GPU).
75       bool off_host_input = device->device_type() == DEVICE_GPU &&
76                             !context->input_alloc_attr(0).on_host();
77 
78       if (off_host_input) {
79         DeviceContext* device_ctxt = context->op_device_context();
80         // Input is not on host: deep-copy it from GPU to the same GPU.
81         Notification done_copy;
82         GPUUtil::CopyGPUTensorToSameGPU(
83             device, device_ctxt, &src_tensor, copied_tensor,
84             [&done_copy](const Status& s) { done_copy.Notify(); });
85         done_copy.WaitForNotification();
86       } else {
87         // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
88         *copied_tensor = tensor::DeepCopy(src_tensor);
89       }
90 #elif defined(TENSORFLOW_USE_SYCL)
91       Device* device = static_cast<Device*>(context->device());
92       // Determine if the input tensor is not on CPU (e.g., on GPU).
93       const bool off_host_input = device->device_type() == DEVICE_SYCL &&
94                                   !context->input_alloc_attr(0).on_host();
95 
96       if (off_host_input) {
97         SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
98       } else {
99         *copied_tensor = tensor::DeepCopy(src_tensor);
100       }
101 #else
102       *copied_tensor = tensor::DeepCopy(src_tensor);
103 #endif
104     } else {
105       // Source tensor is NOT initialized and/or is not mem-copyable: Forward
106       // the Tensor object.
107       context->set_output(0, src_tensor);
108     }
109   }
110 
IsExpensive()111   bool IsExpensive() override { return false; }
112 
113  private:
114   string tensor_name_;
115   std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
116 };
117 
118 // Base class of all debug ops.
119 class BaseDebugOp : public OpKernel {
120  public:
BaseDebugOp(const string & debug_op_name,OpKernelConstruction * context)121   explicit BaseDebugOp(const string& debug_op_name,
122                        OpKernelConstruction* context)
123       : OpKernel(context), debug_op_name_(debug_op_name) {
124     OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
125     OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));
126 
127     string device_name;
128     string tensor_name;
129     OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
130     OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));
131 
132     std::vector<string> name_items = str_util::Split(tensor_name, ':');
133     string node_name;
134     int32 output_slot = 0;
135     OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
136                 errors::InvalidArgument("Failed to parse tensor name: \"",
137                                         tensor_name, "\""));
138     if (name_items.size() == 2) {
139       node_name = name_items[0];
140       OP_REQUIRES(
141           context, strings::safe_strto32(name_items[1], &output_slot),
142           errors::InvalidArgument("Invalid string value for output_slot: \"",
143                                   name_items[1], "\""));
144     } else if (name_items.size() == 1) {
145       node_name = name_items[0];
146     }
147 
148     debug_watch_key_.reset(
149         new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
150   }
151 
IsExpensive()152   bool IsExpensive() override { return false; }
153 
154  protected:
155   // Apply gRPC gating (if gated_grpc_ attribute is true).
156   //
157   // Returns false if and only if all grpc:// debug URLs of the debug op are
158   // disabled currently (i.e., gated off), in which case the debug op will emit
159   // an empty (size {0}) tensor of undefined data type.
ApplyGrpcGating(OpKernelContext * context)160   bool ApplyGrpcGating(OpKernelContext* context) {
161     if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
162                            debug_watch_key_->debug_node_name, debug_urls_)) {
163       // The entire node is gated off: Output an empty tensor and avoid
164       // expensive computation.
165       Tensor* output_tensor;
166       TensorShape shape({0});
167       if (!context->allocate_output(0, shape, &output_tensor).ok()) {
168         LOG(ERROR) << "Debug node of watch key "
169                    << debug_watch_key_->debug_node_name
170                    << " failed to allocate empty tensor under gated-off state.";
171       }
172       return false;
173     } else {
174       return true;
175     }
176   }
177 
178   // Publish a tensor to all debug URLs of the debug op.
179   // Log an error if the publishing failed.
PublishTensor(const Tensor & tensor)180   Status PublishTensor(const Tensor& tensor) {
181     if (debug_urls_.empty()) {
182       return Status::OK();
183     } else {
184       Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
185                                                   Env::Default()->NowMicros(),
186                                                   debug_urls_, gated_grpc_);
187       if (!status.ok()) {
188         LOG(ERROR) << "Debug node of watch key "
189                    << debug_watch_key_->debug_node_name
190                    << " failed to publish debug tensor data to all URLs "
191                    << str_util::Join(debug_urls_, ", ")
192                    << ", due to: " << status.error_message();
193       }
194       return status;
195     }
196   }
197 
198  private:
199   const string debug_op_name_;
200   std::unique_ptr<DebugNodeKey> debug_watch_key_;
201   std::vector<string> debug_urls_;
202   bool gated_grpc_;
203 };
204 
205 // Identity op for debugging.
206 //   Output slot 0 carries the debug signal and is always allocated on the
207 //   host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
208 //   the debug signal is equal to the input tensor.
209 class DebugIdentityOp : public BaseDebugOp {
210  public:
DebugIdentityOp(OpKernelConstruction * context)211   explicit DebugIdentityOp(OpKernelConstruction* context)
212       : BaseDebugOp("DebugIdentity", context) {}
213 
Compute(OpKernelContext * context)214   void Compute(OpKernelContext* context) override {
215     if (!ApplyGrpcGating(context)) {
216       return;
217     }
218 
219     OP_REQUIRES_OK(context, PublishTensor(context->input(0)));
220     context->set_output(0, context->input(0));
221   }
222 };
223 
224 // NaN-counter op for debugging.
225 template <typename T>
226 class DebugNanCountOp : public BaseDebugOp {
227  public:
DebugNanCountOp(OpKernelConstruction * context)228   explicit DebugNanCountOp(OpKernelConstruction* context)
229       : BaseDebugOp("DebugNanCount", context) {}
230 
Compute(OpKernelContext * context)231   void Compute(OpKernelContext* context) override {
232     if (!ApplyGrpcGating(context)) {
233       return;
234     }
235 
236     Tensor* output_tensor;
237     const Tensor& input = context->input(0);
238 
239     // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
240     int64 nan_count = 0;
241 
242     // If the input is an uninitialized tensor, let nan_count be 0.
243     if (input.IsInitialized()) {
244       // Count NaNs.
245       const TensorShape& input_shape = input.shape();
246       const T* input_flat = input.template flat<T>().data();
247 
248       for (int64 i = 0; i < input_shape.num_elements(); ++i) {
249         if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
250           nan_count++;
251         }
252       }
253     }
254 
255     TensorShape shape({1});
256     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
257     output_tensor->vec<int64>()(0) = nan_count;
258     OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
259   }
260 };
261 
262 // Numeric summary op for debugging.
263 template <typename T>
264 class DebugNumericSummaryOp : public BaseDebugOp {
265  public:
DebugNumericSummaryOp(OpKernelConstruction * context)266   explicit DebugNumericSummaryOp(OpKernelConstruction* context)
267       : BaseDebugOp("DebugNumericSummary", context) {
268     OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
269     OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
270     OP_REQUIRES_OK(context,
271                    context->GetAttr("mute_if_healthy", &mute_if_healthy_));
272   }
273 
Compute(OpKernelContext * context)274   void Compute(OpKernelContext* context) override {
275     if (!ApplyGrpcGating(context)) {
276       return;
277     }
278 
279     Tensor* output_tensor;
280     const Tensor& input = context->input(0);
281 
282     int64 is_initialized = 0;
283     int64 element_count = 0;
284     int64 negative_inf_count = 0;
285     int64 negative_count = 0;
286     int64 zero_count = 0;
287     int64 positive_count = 0;
288     int64 positive_inf_count = 0;
289     int64 nan_count = 0;
290     double min = std::numeric_limits<double>::infinity();
291     double max = -std::numeric_limits<double>::infinity();
292     double sum = 0.0;
293     double mean = std::numeric_limits<double>::quiet_NaN();
294     double variance = std::numeric_limits<double>::quiet_NaN();
295 
296     // Equal to negative_count + zero_count + positive_count.
297     int64 non_inf_nan_count = 0;
298 
299     const TensorShape& input_shape = input.shape();
300     if (input.IsInitialized()) {
301       is_initialized = 1;
302       const T* input_flat = input.template flat<T>().data();
303 
304       element_count = input_shape.num_elements();
305       const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
306       const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);
307 
308       for (int64 i = 0; i < element_count; ++i) {
309         const double x = static_cast<double>(input_flat[i]);
310         if (Eigen::numext::isnan(x)) {
311           nan_count++;
312         } else if (Eigen::numext::isinf(x)) {
313           if (x < 0.0) {
314             negative_inf_count++;
315           } else {
316             positive_inf_count++;
317           }
318         } else {
319           if (is_lower_bound_custom && x <= lower_bound_) {
320             negative_inf_count++;
321           } else if (is_upper_bound_custom && x >= upper_bound_) {
322             positive_inf_count++;
323           } else if (x < 0.0) {
324             negative_count++;
325           } else if (x > 0.0) {
326             positive_count++;
327           } else {
328             zero_count++;
329           }
330 
331           if (x < min) {
332             min = x;
333           }
334           if (x > max) {
335             max = x;
336           }
337 
338           non_inf_nan_count++;
339           sum += x;
340         }
341       }
342 
343       if (non_inf_nan_count > 0) {
344         mean = sum / non_inf_nan_count;
345 
346         // Do a second pass to compute variance.
347         variance = 0.0;
348         for (int64 i = 0; i < element_count; ++i) {
349           const double x = static_cast<double>(input_flat[i]);
350           if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
351             variance += (x - mean) * (x - mean);
352           }
353         }
354         variance /= non_inf_nan_count;
355       }
356     }
357 
358     TensorShape shape({14 + input_shape.dims()});
359     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
360     output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
361     output_tensor->vec<double>()(1) = static_cast<double>(element_count);
362     output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
363     output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
364     output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
365     output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
366     output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
367     output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
368     output_tensor->vec<double>()(8) = min;
369     output_tensor->vec<double>()(9) = max;
370     output_tensor->vec<double>()(10) = mean;
371     output_tensor->vec<double>()(11) = variance;
372 
373     output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
374     output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
375     for (size_t d = 0; d < input_shape.dims(); ++d) {
376       output_tensor->vec<double>()(14 + d) =
377           static_cast<double>(input_shape.dim_sizes()[d]);
378     }
379 
380     bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
381                 positive_inf_count == 0;
382     if (!mute) {
383       OP_REQUIRES_OK(context, PublishTensor(*output_tensor));
384     }
385   }
386 
387  private:
388   float lower_bound_;
389   float upper_bound_;
390   bool mute_if_healthy_;
391 };
392 
393 }  // namespace tensorflow
394 
395 #endif  // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
396