1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_KERNELS_CAST_OP_H_
17 #define TENSORFLOW_CORE_KERNELS_CAST_OP_H_
18
#include <cstring>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bfloat16.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/types.h"
26
// Note that, unlike the CPU cast functor templates, the GPU ones need to be
// explicitly instantiated, and hence their specializations differ from those
// used for CPUs.
29 #ifdef SPECIALIZE_FOR_GPUS
// GPU flavour of SPECIALIZE_CAST.
//
// Defines a partial specialization of CastFunctor for the (OUT_TYPE, IN_TYPE)
// pair that is still generic over the device type, and then explicitly
// instantiates it for DEVICE.
//
// The generated operator() converts `in_tensor` element-wise into
// `out_tensor` on device `d`. When `truncate` is true each input element is
// first passed through LSBZeroSetter<IN_TYPE, OUT_TYPE> before being cast;
// otherwise a plain cast is performed.
#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_TYPE)                    \
  template <typename Device>                                          \
  struct CastFunctor<Device, OUT_TYPE, IN_TYPE> {                     \
    void operator()(const Device& d,                                  \
                    typename TTypes<OUT_TYPE>::Flat out_tensor,       \
                    typename TTypes<IN_TYPE>::ConstFlat in_tensor,    \
                    bool truncate = false) {                          \
      if (!truncate) {                                                \
        out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>();   \
        return;                                                       \
      }                                                               \
      out_tensor.device(d) =                                          \
          in_tensor.unaryExpr(LSBZeroSetter<IN_TYPE, OUT_TYPE>())     \
              .template cast<OUT_TYPE>();                             \
    }                                                                 \
  };                                                                  \
  template struct CastFunctor<DEVICE, OUT_TYPE, IN_TYPE>;
47 #else
// CPU flavour of SPECIALIZE_CAST.
//
// Defines a full specialization of CastFunctor<DEVICE, OUT_TYPE, IN_TYPE>.
//
// The generated operator() converts `in_tensor` element-wise into
// `out_tensor` on device `d`. When `truncate` is true each input element is
// first passed through LSBZeroSetter<IN_TYPE, OUT_TYPE> before being cast;
// otherwise a plain cast is performed.
#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_TYPE)                    \
  template <>                                                         \
  struct CastFunctor<DEVICE, OUT_TYPE, IN_TYPE> {                     \
    void operator()(const DEVICE& d,                                  \
                    typename TTypes<OUT_TYPE>::Flat out_tensor,       \
                    typename TTypes<IN_TYPE>::ConstFlat in_tensor,    \
                    bool truncate = false) {                          \
      if (!truncate) {                                                \
        out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>();   \
        return;                                                       \
      }                                                               \
      out_tensor.device(d) =                                          \
          in_tensor.unaryExpr(LSBZeroSetter<IN_TYPE, OUT_TYPE>())     \
              .template cast<OUT_TYPE>();                             \
    }                                                                 \
  };
64 #endif
65
// CAST_FUNCTORS(devname) emits, for the device type `devname`:
//   1. one truncation-aware CastFunctor specialization (via SPECIALIZE_CAST)
//      for each (output type, input type) pair listed below, and
//   2. a catch-all partial specialization covering every other type pair.
//      It always performs a plain cast; its `truncate` argument is accepted
//      but not used.
#define CAST_FUNCTORS(devname)                                        \
  SPECIALIZE_CAST(devname, float, double)                             \
  SPECIALIZE_CAST(devname, float, std::complex<double>)               \
  SPECIALIZE_CAST(devname, std::complex<float>, std::complex<double>) \
  SPECIALIZE_CAST(devname, std::complex<float>, double)               \
  SPECIALIZE_CAST(devname, Eigen::half, double)                       \
  SPECIALIZE_CAST(devname, Eigen::half, float)                        \
  SPECIALIZE_CAST(devname, Eigen::half, std::complex<double>)         \
  SPECIALIZE_CAST(devname, Eigen::half, std::complex<float>)          \
  SPECIALIZE_CAST(devname, bfloat16, float)                           \
  template <typename OutT, typename InT>                              \
  struct CastFunctor<devname, OutT, InT> {                            \
    void operator()(const devname& d,                                 \
                    typename TTypes<OutT>::Flat out_tensor,           \
                    typename TTypes<InT>::ConstFlat in_tensor,        \
                    bool truncate = false) {                          \
      out_tensor.device(d) = in_tensor.template cast<OutT>();         \
    }                                                                 \
  };
85
86 namespace tensorflow {
87
88 typedef std::function<void(OpKernelContext*, const Tensor&, Tensor*,
89 bool trunc)>
90 CastFunctorType;
91
92 // Common base class of Cast kernels
93 class CastOpBase : public OpKernel {
94 public:
95 explicit CastOpBase(OpKernelConstruction* ctx);
96
97 void Compute(OpKernelContext* ctx) override;
98
99 protected:
100 DataType src_dtype_;
101 DataType dst_dtype_;
102 DataType external_src_dtype_;
103 DataType external_dst_dtype_;
104 bool use_truncation_;
105 CastFunctorType work_ = nullptr;
106 Status Unimplemented();
107
108 TF_DISALLOW_COPY_AND_ASSIGN(CastOpBase);
109 };
110
// CPU implementation of Cast.
//
// Adds only a constructor and a private Prepare() step on top of CastOpBase;
// both are defined outside this header.
class CpuCastOp : public CastOpBase {
 public:
  explicit CpuCastOp(OpKernelConstruction* ctx);

 private:
  // NOTE(review): presumably selects the cast routine for the configured
  // dtypes and reports failure through the returned Status -- confirm against
  // the definition.
  Status Prepare();
};
119
120 namespace functor {
121
// Returns the number of significand digits of type I as reported by
// std::numeric_limits<I>::digits (for floating-point types this count
// includes the implicit leading bit).
//
// Types that std::numeric_limits does not describe need their own explicit
// specialization of this function; otherwise the value falls back to
// whatever the unspecialized std::numeric_limits reports.
template <typename I>
constexpr int MantissaWidth() {
  using Limits = std::numeric_limits<I>;
  return Limits::digits;
}
126
127 template <>
128 constexpr int MantissaWidth<Eigen::half>() {
129 // Remember, there's 1 hidden bit
130 return 10 + 1;
131 }
132
133 template <>
134 constexpr int MantissaWidth<bfloat16>() {
135 // Remember, there's 1 hidden bit
136 return 7 + 1;
137 }
138
139 template <typename Device, typename Tout, typename Tin>
Cast(const Device & d,typename TTypes<Tout>::Flat o,typename TTypes<Tin>::ConstFlat i)140 void Cast(const Device& d, typename TTypes<Tout>::Flat o,
141 typename TTypes<Tin>::ConstFlat i) {
142 o.device(d) = i.template cast<Tout>();
143 }
144
// Primary template of the cast functor.
//
// operator() is only declared here; definitions are supplied by
// specializations (see the SPECIALIZE_CAST and CAST_FUNCTORS macros in this
// header).
//
//   d        - device to evaluate on.
//   o        - flat output tensor with element type Tout.
//   i        - flat, read-only input tensor with element type Tin.
//   truncate - request the truncating variant of the cast (defaults to false).
template <typename Device, typename Tout, typename Tin>
struct CastFunctor {
  void operator()(const Device& d, typename TTypes<Tout>::Flat o,
                  typename TTypes<Tin>::ConstFlat i, bool truncate = false);
};
150
151 // Only enable LSBZeroSetterHelper for 64 and 32 bit input data types.
152 // Specialize for others if needed in future.
153 template <typename I>
154 typename std::enable_if<sizeof(I) == 8, void>::type EIGEN_DEVICE_FUNC
LSBZeroSetterHelper(I & t,int n)155 EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) {
156 // Only zero the bits for non-NaNs.
157 // For NaNs, let the non-truncation version handle it.
158 if (!std::isnan(t)) {
159 uint64_t* p = reinterpret_cast<uint64_t*>(&t);
160 *p &= (0xFFFFFFFFFFFFFFFF << n);
161 }
162 }
163
164 template <typename I>
165 typename std::enable_if<sizeof(I) == 4, void>::type EIGEN_DEVICE_FUNC
LSBZeroSetterHelper(I & t,int n)166 EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) {
167 // Only zero the bits for non-NaNs.
168 // For NaNs, let the non-truncation version handle it.
169 if (!std::isnan(t)) {
170 uint32_t* p = reinterpret_cast<uint32_t*>(&t);
171 *p &= (0xFFFFFFFF << n);
172 }
173 }
174
175 // Set n least significant bits to 0
176 template <typename I, typename O>
177 struct LSBZeroSetter {
EIGEN_EMPTY_STRUCT_CTORLSBZeroSetter178 EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
179
180 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const I operator()(const I& a) const {
181 constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
182 static_assert(
183 bits > 0,
184 "The output type must have fewer mantissa bits than the input type\n");
185 I t = a;
186 LSBZeroSetterHelper(t, bits);
187 return t;
188 }
189 };
190
191 template <typename I, typename O>
192 struct LSBZeroSetter<std::complex<I>, std::complex<O>> {
193 EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
194
195 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<I> operator()(
196 const std::complex<I>& a) const {
197 constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
198 static_assert(
199 bits > 0,
200 "The output type must have fewer mantissa bits than the input type\n");
201 I re = std::real(a);
202 I img = std::imag(a);
203 LSBZeroSetterHelper(re, bits);
204 LSBZeroSetterHelper(img, bits);
205 std::complex<I> toReturn(re, img);
206 return toReturn;
207 }
208 };
209
210 template <typename I, typename O>
211 struct LSBZeroSetter<std::complex<I>, O> {
212 EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
213 // Sets the 16 LSBits of the float to 0
214 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<I> operator()(
215 const std::complex<I>& a) const {
216 constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
217 static_assert(
218 bits > 0,
219 "The output type must have fewer mantissa bits than the input type\n");
220 I re = std::real(a);
221 I img = std::imag(a);
222 LSBZeroSetterHelper(re, bits);
223 LSBZeroSetterHelper(img, bits);
224 std::complex<I> toReturn(re, img);
225 return toReturn;
226 }
227 };
228
229 } // end namespace functor
230 } // end namespace tensorflow
231
232 namespace Eigen {
233 namespace internal {
234
235 // Eigen can't convert to/from complex numbers, because it is limited to cases
236 // that can be static_casted. But numpy is able to cast to/from complex, which
237 // we want to replicate. So we add specializations for complex here.
238 template <typename From, typename To>
239 struct scalar_cast_op<std::complex<From>, To> {
240 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE To
241 operator()(const std::complex<From>& a) const {
242 // Replicate numpy behavior of returning just the real part
243 return static_cast<To>(a.real());
244 }
245 };
246
247 template <typename From, typename To>
248 struct scalar_cast_op<From, std::complex<To>> {
249 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<To> operator()(
250 const From& a) const {
251 // Replicate numpy behavior of setting the imaginary part to 0
252 return std::complex<To>(static_cast<To>(a), To(0));
253 }
254 };
255
256 template <typename From, typename To>
257 struct scalar_cast_op<std::complex<From>, std::complex<To>> {
258 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<To> operator()(
259 const std::complex<From>& a) const {
260 return std::complex<To>(static_cast<To>(a.real()),
261 static_cast<To>(a.imag()));
262 }
263 };
264
265 template <typename From, typename To>
266 struct functor_traits_complex_impl {
267 enum { Cost = NumTraits<To>::AddCost, PacketAccess = false };
268 };
269
// functor_traits for the three complex scalar_cast_op specializations in this
// header. Each one inherits its values from functor_traits_complex_impl.

// Complex -> non-complex.
template <typename From, typename To>
struct functor_traits<scalar_cast_op<std::complex<From>, To>>
    : functor_traits_complex_impl<std::complex<From>, To> {};
// Non-complex -> complex.
template <typename From, typename To>
struct functor_traits<scalar_cast_op<From, std::complex<To>>>
    : functor_traits_complex_impl<From, std::complex<To>> {};
// Complex -> complex.
// Needed to avoid ambiguous partial specialization
template <typename From, typename To>
struct functor_traits<scalar_cast_op<std::complex<From>, std::complex<To>>>
    : functor_traits_complex_impl<std::complex<From>, std::complex<To>> {};
280
281 // Specialized cast op impls for bfloat16.
282 template <>
283 struct scalar_cast_op<::tensorflow::bfloat16, float> {
284 EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
285 typedef float result_type;
286 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(
287 const ::tensorflow::bfloat16& a) const {
288 float ret;
289 uint16_t* p = reinterpret_cast<uint16_t*>(&ret);
290 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
291 p[0] = a.value;
292 p[1] = 0;
293 #else
294 static_assert(::tensorflow::port::kLittleEndian,
295 "Not a little endian system!");
296 p[0] = 0;
297 p[1] = a.value;
298 #endif
299 return ret;
300 }
301 };
302
303 template <>
304 struct functor_traits<scalar_cast_op<::tensorflow::bfloat16, float>> {
305 enum { Cost = NumTraits<float>::AddCost, PacketAccess = false };
306 };
307
308 template <>
309 struct scalar_cast_op<float, ::tensorflow::bfloat16> {
310 EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
311 typedef ::tensorflow::bfloat16 result_type;
312 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ::tensorflow::bfloat16 operator()(
313 const float a) const {
314 return ::tensorflow::bfloat16(a);
315 }
316 };
317
318 template <>
319 struct functor_traits<scalar_cast_op<float, ::tensorflow::bfloat16>> {
320 enum { Cost = NumTraits<float>::AddCost, PacketAccess = false };
321 };
322
323 } // namespace internal
324 } // namespace Eigen
325
326 #endif // TENSORFLOW_CORE_KERNELS_CAST_OP_H_
327