1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // See docs in ../ops/array_ops.cc.
17 
18 // clang-format off
19 #include "tensorflow/core/platform/bfloat16.h"
20 
21 #include <math.h>  // NOLINT
22 #include <algorithm>  // NOLINT
23 #include <numeric>  // NOLINT
24 // clang-format on
25 
26 #include "tensorflow/core/framework/op_kernel.h"
27 #include "tensorflow/core/framework/register_types.h"
28 #include "tensorflow/core/framework/tensor.h"
29 #include "tensorflow/core/framework/tensor_reference.h"
30 #include "tensorflow/core/framework/types.h"
31 
32 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
33 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
34 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
35 
36 #if GOOGLE_CUDA
37 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
38 #elif TENSORFLOW_USE_ROCM
39 #include "tensorflow/core/platform/rocm.h"
40 #endif
41 namespace tensorflow {
42 
43 typedef Eigen::ThreadPoolDevice CPUDevice;
44 typedef Eigen::GpuDevice GPUDevice;
45 
46 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Launcher for the GPU numeric check used by the CheckNumerics (v1) kernel.
// Only the declaration lives here; the member function is not defined in this
// file.
template <typename T>
struct CheckNumericsLaunch {
  // Checks `size` elements starting at `data` using device `d` and records
  // the findings in the two-element `abnormal_detected` buffer.
  // CheckNumericsOp<GPUDevice, T>::checkForAnomalies in this file reads
  // index 0 as the NaN indicator and index 1 as the Inf indicator.
  // NOTE(review): presumably `data` and `abnormal_detected` are device
  // pointers and the work is enqueued on d's stream -- confirm against the
  // implementation of Run.
  void Run(const GPUDevice& d, const T* data, int size,
           int abnormal_detected[2]);
};

// Explicit instantiation declarations: the instantiations for these element
// types are expected to be provided by another translation unit.
extern template struct CheckNumericsLaunch<Eigen::half>;
extern template struct CheckNumericsLaunch<float>;
extern template struct CheckNumericsLaunch<double>;
56 
// Launcher for the GPU numeric check used by the CheckNumericsV2 kernel.
// Only the declaration lives here; the member function is not defined in this
// file.
template <typename T>
struct CheckNumericsLaunchV2 {
  // Checks `size` elements starting at `data` using device `d` and records
  // the findings in the three-element `abnormal_detected` buffer.
  // CheckNumericsV2Op<GPUDevice, T>::checkForAnomalies in this file reads
  // index 0 as the NaN indicator, index 1 as the -Inf indicator and index 2
  // as the +Inf indicator.
  // NOTE(review): presumably `data` and `abnormal_detected` are device
  // pointers and the work is enqueued on d's stream -- confirm against the
  // implementation of Run.
  void Run(const GPUDevice& d, const T* data, int size,
           int abnormal_detected[3]);
};

// Explicit instantiation declarations: the instantiations for these element
// types are expected to be provided by another translation unit.
extern template struct CheckNumericsLaunchV2<Eigen::half>;
extern template struct CheckNumericsLaunchV2<float>;
extern template struct CheckNumericsLaunchV2<double>;
66 #endif
67 
68 namespace {
69 
// Bit flags the CPU kernels OR together while scanning a tensor. The v1
// kernel sets kInfBit/kNaNBit; the v2 kernel sets kNaNBit and the two signed
// infinity flags.
constexpr int kInfBit = 1 << 0;
constexpr int kNaNBit = 1 << 1;
constexpr int kNegativeInfBit = 1 << 2;
constexpr int kPositiveInfBit = 1 << 3;
74 
75 template <typename Device, typename T>
76 class CheckNumericsOp;
77 
78 // Partial specialization for CPU
79 // TODO(jeff,rmlarsen): We should make this variant be an AsyncOpKernel, as
80 // was done for the GPU case below.
81 template <typename T>
82 class CheckNumericsOp<CPUDevice, T> : public OpKernel {
83  public:
CheckNumericsOp(OpKernelConstruction * context)84   explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
85     // message_ is used as the prefix for the assertion error message. For
86     // instance, this can be the name of the input op that produced the tensor.
87     OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
88   }
89 
Compute(OpKernelContext * context)90   void Compute(OpKernelContext* context) override {
91     // pass along the input to the output
92     context->set_output(0, context->input(0));
93 
94     auto in = context->input(0).flat<T>();
95     const T* data = in.data();
96     const int64 size = in.size();
97     // Check to see if any element of the tensor is NaN or Inf.
98     int fp_props = std::accumulate(
99         data, data + size, 0,
100         [this](const int x, const T& y) { return checkFloatingElement(x, y); });
101     if (fp_props != 0) {
102       const string& status = getErrorString(fp_props);
103       if (!status.empty()) {
104         context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
105                                                    status, " values"));
106       }
107     }
108   }
109 
110  protected:
checkFloatingElement(const int x,const T & y)111   virtual int checkFloatingElement(const int x, const T& y) {
112     int result = x;
113     if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
114       // Do nothing: common case.
115     } else {
116       if (Eigen::numext::isinf(y)) {
117         result |= kInfBit;
118       } else if (Eigen::numext::isnan(y)) {
119         result |= kNaNBit;
120       }
121     }
122     return result;
123   }
124 
getErrorString(const int fp_props)125   virtual const string getErrorString(const int fp_props) {
126     string status;
127     if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
128       status = "Inf and NaN";
129     } else {
130       if (fp_props & kInfBit) {
131         status = "Inf";
132       }
133       if (fp_props & kNaNBit) {
134         status = "NaN";
135       }
136     }
137     return status;
138   }
139 
140  private:
141   string message_;
142 };
143 
144 template <typename Device, typename T>
145 class CheckNumericsV2Op;
146 
147 // Partial specialization for CPU: v2.
148 // The v2 op differs from the v1 in that it distinguishes -inf and +inf.
149 template <typename T>
150 class CheckNumericsV2Op<CPUDevice, T> : public CheckNumericsOp<CPUDevice, T> {
151  public:
CheckNumericsV2Op(OpKernelConstruction * context)152   explicit CheckNumericsV2Op(OpKernelConstruction* context)
153       : CheckNumericsOp<CPUDevice, T>(context) {}
154 
155  protected:
checkFloatingElement(const int x,const T & y)156   int checkFloatingElement(const int x, const T& y) override {
157     int result = x;
158     if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) {
159       // Do nothing: common case.
160     } else {
161       if (Eigen::numext::isinf(y)) {
162         result |= y < static_cast<T>(0.) ? kNegativeInfBit : kPositiveInfBit;
163       } else if (Eigen::numext::isnan(y)) {
164         result |= kNaNBit;
165       }
166     }
167     return result;
168   }
169 
getErrorString(const int fp_props)170   const string getErrorString(const int fp_props) override {
171     std::vector<string> anomalies;
172     if (fp_props & kNegativeInfBit) {
173       anomalies.push_back("-Inf");
174     }
175     if (fp_props & kPositiveInfBit) {
176       anomalies.push_back("+Inf");
177     }
178     if (fp_props & kNaNBit) {
179       anomalies.push_back("NaN");
180     }
181     if (anomalies.size() == 3) {
182       return strings::StrCat(anomalies[0], ", ", anomalies[1], ", and ",
183                              anomalies[2]);
184     } else if (anomalies.size() == 2) {
185       return strings::StrCat(anomalies[0], " and ", anomalies[1]);
186     } else {
187       return anomalies[0];
188     }
189   }
190 };
191 
192 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
193 // Partial specialization for GPU
194 template <typename T>
195 class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
196  public:
197   typedef GPUDevice Device;
198 
CheckNumericsOp(OpKernelConstruction * context)199   explicit CheckNumericsOp(OpKernelConstruction* context)
200       : AsyncOpKernel(context) {
201     // message_ is used as the prefix for the assertion error message. For
202     // instance, this can be the name of the input op that produced the tensor.
203     OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
204   }
205 
ComputeAsync(OpKernelContext * context,DoneCallback done)206   void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
207     // pass along the input to the output
208     context->set_output(0, context->input(0));
209     if (context->input(0).NumElements() == 0) {
210       done();
211       return;
212     }
213     auto input = context->input(0).flat<T>();
214 
215     // Allocate and initialize the elements to hold the check results
216     Tensor abnormal_detected;
217     const int abnormal_detected_size = getAnomalyIndicatorSize();
218     OP_REQUIRES_OK(context, context->allocate_temp(
219                                 DT_INT32, TensorShape({abnormal_detected_size}),
220                                 &abnormal_detected));
221 
222     auto* stream = context->op_device_context()->stream();
223     OP_REQUIRES_ASYNC(context, stream != nullptr,
224                       errors::Internal("No GPU stream available."), done);
225 
226     se::DeviceMemoryBase abnormal_detected_ptr(
227         abnormal_detected.flat<int>().data(),
228         abnormal_detected.flat<int>().size());
229     stream->ThenMemset32(&abnormal_detected_ptr, 0,
230                          abnormal_detected.flat<int>().size() * sizeof(int));
231 
232     // Call the GPU kernels for the numerical checks
233     const Device& d = context->eigen_device<Device>();
234     RunKernel(d, input.data(), input.size(),
235               abnormal_detected.flat<int>().data());
236 
237     // Copy the results from device to host
238     AllocatorAttributes attr;
239     attr.set_on_host(true);
240     attr.set_gpu_compatible(true);
241     Tensor abnormal_detected_host;
242     OP_REQUIRES_OK_ASYNC(
243         context,
244         context->allocate_temp(DT_INT32, TensorShape({abnormal_detected_size}),
245                                &abnormal_detected_host, attr),
246         done);
247     OP_REQUIRES_ASYNC(
248         context,
249         stream
250             ->ThenMemcpy(abnormal_detected_host.flat<int>().data(),
251                          abnormal_detected_ptr,
252                          abnormal_detected_size * sizeof(int))
253             .ok(),
254         errors::Internal("GPU memcpy from device to host failed"), done);
255 
256     // We have observed crashes on some network stacks when not holding
257     // this tensor reference.
258     TensorReference abnormal_detected_ref(abnormal_detected);
259     auto check_cb = [this, stream, abnormal_detected_ref,
260                      abnormal_detected_host, context, done]() {
261 #if GOOGLE_CUDA
262       se::cuda::ScopedActivateExecutorContext scoped_activation{
263           stream->parent()};
264 #elif TENSORFLOW_USE_ROCM
265       se::rocm::ScopedActivateExecutorContext scoped_activation{
266           stream->parent()};
267 #endif
268       TTypes<const int>::Vec abnormal_detected_host_flat =
269           abnormal_detected_host.flat<int>();
270       abnormal_detected_ref.Unref();
271       checkForAnomalies(context, abnormal_detected_host_flat);
272       done();
273     };
274     context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
275         stream, std::move(check_cb));
276   }
277 
278  protected:
getAnomalyIndicatorSize()279   virtual int getAnomalyIndicatorSize() { return 2; }
280 
RunKernel(const GPUDevice & d,const T * data,int size,int * abnormal_detected)281   virtual void RunKernel(const GPUDevice& d, const T* data, int size,
282                          int* abnormal_detected) {
283     CheckNumericsLaunch<T>().Run(d, data, size, abnormal_detected);
284   }
285 
checkForAnomalies(OpKernelContext * context,const TTypes<const int>::Vec & abnormality_indicators)286   virtual void checkForAnomalies(
287       OpKernelContext* context,
288       const TTypes<const int>::Vec& abnormality_indicators) {
289     const int is_nan = abnormality_indicators(0);
290     const int is_inf = abnormality_indicators(1);
291     if (is_nan || is_inf) {
292       LOG(ERROR) << "abnormal_detected_host @" << abnormality_indicators.data()
293                  << " = {" << is_nan << ", " << is_inf << "} " << message_;
294 
295       string anomalies;
296       if (is_nan && is_inf) {
297         anomalies = "Inf and NaN";
298       } else if (is_nan) {
299         anomalies = "NaN";
300       } else if (is_inf) {
301         anomalies = "Inf";
302       }
303       context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
304                                                  anomalies, " values"));
305     }
306   }
307 
308   string message_;
309 };
310 
311 template <typename T>
312 class CheckNumericsV2Op<GPUDevice, T> : public CheckNumericsOp<GPUDevice, T> {
313  public:
CheckNumericsV2Op(OpKernelConstruction * context)314   CheckNumericsV2Op(OpKernelConstruction* context)
315       : CheckNumericsOp<GPUDevice, T>(context) {}
316 
317  protected:
getAnomalyIndicatorSize()318   int getAnomalyIndicatorSize() override { return 3; }
319 
RunKernel(const GPUDevice & d,const T * data,int size,int * abnormal_detected)320   void RunKernel(const GPUDevice& d, const T* data, int size,
321                  int* abnormal_detected) override {
322     CheckNumericsLaunchV2<T>().Run(d, data, size, abnormal_detected);
323   }
324 
checkForAnomalies(OpKernelContext * context,const TTypes<const int>::Vec & abnormality_indicators)325   void checkForAnomalies(
326       OpKernelContext* context,
327       const TTypes<const int>::Vec& abnormality_indicators) override {
328     const int is_nan = abnormality_indicators(0);
329     const int is_negative_inf = abnormality_indicators(1);
330     const int is_positive_inf = abnormality_indicators(2);
331     if (is_negative_inf || is_positive_inf || is_nan) {
332       std::vector<string> anomalies;
333       if (is_negative_inf) {
334         anomalies.push_back("-Inf");
335       }
336       if (is_positive_inf) {
337         anomalies.push_back("+Inf");
338       }
339       if (is_nan) {
340         anomalies.push_back("NaN");
341       }
342       string all_anomalies;
343       if (anomalies.size() == 3) {
344         all_anomalies = strings::StrCat(anomalies[0], ", ", anomalies[1],
345                                         ", and ", anomalies[2]);
346       } else if (anomalies.size() == 2) {
347         all_anomalies = strings::StrCat(anomalies[0], " and ", anomalies[1]);
348       } else {
349         all_anomalies = anomalies[0];
350       }
351       context->SetStatus(errors::InvalidArgument(
352           this->message_, " : Tensor had ", all_anomalies, " values"));
353     }
354   }
355 
356   static constexpr int abnormal_detected_size = 3;
357 };
358 
359 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
360 
361 }  // namespace
362 
363 #define REGISTER_CPU_KERNEL(T)                                         \
364   REGISTER_KERNEL_BUILDER(                                             \
365       Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
366       CheckNumericsOp<CPUDevice, T>);
367 TF_CALL_half(REGISTER_CPU_KERNEL);
368 TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
369 TF_CALL_float(REGISTER_CPU_KERNEL);
370 TF_CALL_double(REGISTER_CPU_KERNEL);
371 
372 #define REGISTER_V2_CPU_KERNEL(T)                                        \
373   REGISTER_KERNEL_BUILDER(                                               \
374       Name("CheckNumericsV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
375       CheckNumericsV2Op<CPUDevice, T>);
376 TF_CALL_half(REGISTER_V2_CPU_KERNEL);
377 TF_CALL_bfloat16(REGISTER_V2_CPU_KERNEL);
378 TF_CALL_float(REGISTER_V2_CPU_KERNEL);
379 TF_CALL_double(REGISTER_V2_CPU_KERNEL);
380 
381 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
382 REGISTER_KERNEL_BUILDER(
383     Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
384     CheckNumericsOp<GPUDevice, Eigen::half>);
385 REGISTER_KERNEL_BUILDER(
386     Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<float>("T"),
387     CheckNumericsOp<GPUDevice, float>);
388 REGISTER_KERNEL_BUILDER(
389     Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<double>("T"),
390     CheckNumericsOp<GPUDevice, double>);
391 
392 REGISTER_KERNEL_BUILDER(
393     Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
394     CheckNumericsV2Op<GPUDevice, Eigen::half>);
395 REGISTER_KERNEL_BUILDER(
396     Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<float>("T"),
397     CheckNumericsV2Op<GPUDevice, float>);
398 REGISTER_KERNEL_BUILDER(
399     Name("CheckNumericsV2").Device(DEVICE_GPU).TypeConstraint<double>("T"),
400     CheckNumericsV2Op<GPUDevice, double>);
401 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
402 
403 }  // namespace tensorflow
404