1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_H_
17 #define TENSORFLOW_CORE_KERNELS_CONV_2D_H_
18 
19 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
20 #include "tensorflow/core/framework/tensor_types.h"
21 #include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h"
22 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
23 #include "tensorflow/core/util/tensor_format.h"
24 
25 namespace tensorflow {
26 namespace functor {
27 
28 template <typename Device, typename Input, typename Filter, typename Output,
29           typename OutputKernel>
30 void SpatialConvolutionFunc(const Device& d, Output output, Input input,
31                             Filter filter, int row_stride, int col_stride,
32                             int row_dilation, int col_dilation,
33                             const Eigen::PaddingType& padding,
34                             const OutputKernel& output_kernel,
35                             int padding_top = 0, int padding_bottom = 0,
36                             int padding_left = 0, int padding_right = 0) {
37   // Need to swap row/col, padding_top/padding_left, and
38   // padding_bottom/padding_right when calling Eigen. Eigen expects the tensor
39   // in NWHC format, but the tensor given is in NHWC.
40   output.device(d) = Eigen::SpatialConvolution(
41       input, filter, col_stride, row_stride, padding, col_dilation,
42       row_dilation, output_kernel, padding_left, padding_right, padding_top,
43       padding_bottom);
44 }
45 
46 template <typename Device, typename T,
47           typename OutputKernel = const Eigen::NoOpOutputKernel>
48 struct SpatialConvolution {
operatorSpatialConvolution49   void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
50                   typename TTypes<T, 4>::ConstTensor input,
51                   typename TTypes<T, 4>::ConstTensor filter, int row_stride,
52                   int col_stride, int row_dilation, int col_dilation,
53                   const Eigen::PaddingType& padding,
54                   const OutputKernel& output_kernel = OutputKernel()) {
55     SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
56                            row_dilation, col_dilation, padding, output_kernel);
57   }
operatorSpatialConvolution58   void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
59                   typename TTypes<T, 4>::ConstTensor input,
60                   typename TTypes<T, 4>::ConstTensor filter, int row_stride,
61                   int col_stride, int row_dilation, int col_dilation,
62                   int padding_top, int padding_bottom, int padding_left,
63                   int padding_right,
64                   const OutputKernel& output_kernel = OutputKernel()) {
65     SpatialConvolutionFunc(
66         d, output, input, filter, row_stride, col_stride, row_dilation,
67         col_dilation, Eigen::PaddingType::PADDING_VALID, output_kernel,
68         padding_top, padding_bottom, padding_left, padding_right);
69   }
70 };
71 
72 template <typename Device, typename OutputKernel>
73 struct SpatialConvolution<Device, Eigen::half, OutputKernel> {
74   void operator()(const Device& d,
75                   typename TTypes<Eigen::half, 4>::Tensor output,
76                   typename TTypes<Eigen::half, 4>::ConstTensor input,
77                   typename TTypes<Eigen::half, 4>::ConstTensor filter,
78                   int row_stride, int col_stride, int row_dilation,
79                   int col_dilation, const Eigen::PaddingType& padding,
80                   const OutputKernel& output_kernel = OutputKernel()) {
81     output.device(d) =
82         Eigen::SpatialConvolution(input.cast<float>(), filter.cast<float>(),
83                                   col_stride, row_stride, padding, col_dilation,
84                                   row_dilation, output_kernel)
85             .template cast<Eigen::half>();
86   }
87   void operator()(const Device& d,
88                   typename TTypes<Eigen::half, 4>::Tensor output,
89                   typename TTypes<Eigen::half, 4>::ConstTensor input,
90                   typename TTypes<Eigen::half, 4>::ConstTensor filter,
91                   int row_stride, int col_stride, int row_dilation,
92                   int col_dilation, int padding_top, int padding_bottom,
93                   int padding_left, int padding_right,
94                   const OutputKernel& output_kernel = OutputKernel()) {
95     output.device(d) =
96         Eigen::SpatialConvolution(
97             input.cast<float>(), filter.cast<float>(), col_stride, row_stride,
98             Eigen::PaddingType::PADDING_VALID, col_dilation, row_dilation,
99             output_kernel, padding_left, padding_right, padding_top,
100             padding_bottom)
101             .template cast<Eigen::half>();
102   }
103 };
104 
105 template <typename Device, typename T>
106 struct SpatialConvolutionBackwardInputFunc {
107   void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
108                   typename TTypes<T, 4>::ConstTensor filter,
109                   typename TTypes<T, 4>::ConstTensor output_backward,
110                   Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
111                   Eigen::DenseIndex col_dilation,
112                   Eigen::DenseIndex row_dilation) {
113     input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput(
114         filter, output_backward, input_backward.dimension(2),
115         input_backward.dimension(1), col_stride, row_stride, col_dilation,
116         row_dilation);
117   }
118 };
119 
120 // GPU version requires all tensors to be indexable by int32.
121 template <typename T>
122 struct SpatialConvolutionBackwardInputFunc<Eigen::GpuDevice, T> {
123   void operator()(const Eigen::GpuDevice& d,
124                   typename TTypes<T, 4>::Tensor input_backward,
125                   typename TTypes<T, 4>::ConstTensor filter,
126                   typename TTypes<T, 4>::ConstTensor output_backward,
127                   Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
128                   Eigen::DenseIndex col_dilation,
129                   Eigen::DenseIndex row_dilation) {
130     To32Bit(input_backward).device(d) = Eigen::SpatialConvolutionBackwardInput(
131         To32Bit(filter), To32Bit(output_backward), input_backward.dimension(2),
132         input_backward.dimension(1), col_stride, row_stride, col_dilation,
133         row_dilation);
134   }
135 };
136 
137 template <typename Device, typename T>
138 struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc {
139   void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
140                   typename TTypes<T, 4>::ConstTensor filter,
141                   typename TTypes<T, 4>::ConstTensor output_backward,
142                   Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows,
143                   Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
144                   Eigen::DenseIndex col_dilation,
145                   Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left,
146                   Eigen::DenseIndex pad_top) {
147     // We have to slice the result of a spatial convolution backward
148     // input, before assigning it to the `input_backward` to remove padding.
149     //
150     // TODO(ezhulenev): Pass explicit paddings to Eigen and do not materialize
151     // intermediate result in memory before slicing.
152     input_backward.device(d) =
153         Eigen::SpatialConvolutionBackwardInput(
154             filter, output_backward, padded_cols, padded_rows, col_stride,
155             row_stride, col_dilation, row_dilation)
156             .eval()
157             .slice(Eigen::DSizes<Eigen::DenseIndex, 4>{0, pad_left, pad_top, 0},
158                    input_backward.dimensions());
159   }
160 };
161 
162 // GPU version requires all tensors to be indexable by int32.
163 template <typename T>
164 struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc<Eigen::GpuDevice,
165                                                               T> {
166   void operator()(const Eigen::GpuDevice& d,
167                   typename TTypes<T, 4>::Tensor input_backward,
168                   typename TTypes<T, 4>::ConstTensor filter,
169                   typename TTypes<T, 4>::ConstTensor output_backward,
170                   Eigen::DenseIndex padded_cols, Eigen::DenseIndex padded_rows,
171                   Eigen::DenseIndex col_stride, Eigen::DenseIndex row_stride,
172                   Eigen::DenseIndex col_dilation,
173                   Eigen::DenseIndex row_dilation, Eigen::DenseIndex pad_left,
174                   Eigen::DenseIndex pad_top) {
175     To32Bit(input_backward).device(d) =
176         Eigen::SpatialConvolutionBackwardInput(
177             To32Bit(filter), To32Bit(output_backward), padded_cols, padded_rows,
178             col_stride, row_stride, col_dilation, row_dilation)
179             .eval()
180             .slice(Eigen::DSizes<Eigen::DenseIndex, 4>{0, pad_left, pad_top, 0},
181                    input_backward.dimensions());
182   }
183 };
184 
185 // TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h.
186 // My initial attempt to do this compiled but failed in the pytest
187 // due to a swigdeps error.
188 template <typename Device, typename T,
189           typename OutputKernel = const Eigen::NoOpOutputKernel>
190 struct MatMulConvFunctor {
191   // Computes on device "d": out = in0 * in1, where * is matrix
192   // multiplication.
193   void operator()(
194       const Device& d, typename TTypes<T, 2>::Tensor out,
195       typename TTypes<T, 2>::ConstTensor in0,
196       typename TTypes<T, 2>::ConstTensor in1,
197       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
198       const OutputKernel& output_kernel = OutputKernel()) {
199     out.device(d) = in0.contract(in1, dim_pair, output_kernel);
200   }
201 };
202 
203 // Shuffles a filter tensor from TensorFlow format HWIO to dst_filter_format.
204 //
205 // Note: Currently supports OIHW and OHWI destination formats.
206 template <typename Device, typename T, typename IndexType, int NDIMS>
207 struct TransformFilter {
208   void operator()(const Device& d, FilterTensorFormat dst_filter_format,
209                   typename TTypes<T, NDIMS, IndexType>::ConstTensor in,
210                   typename TTypes<T, NDIMS, IndexType>::Tensor out) {
211     // NOTE: Source filter format is always HWIO.
212     Eigen::DSizes<IndexType, NDIMS - 2> spatial_dims;
213     for (int i = 0; i < spatial_dims.rank(); ++i) {
214       spatial_dims[i] = in.dimension(i);
215     }
216 
217     // Merge the spatial dimensions together to speed up the shuffle operation.
218     Eigen::DSizes<IndexType, 3> merged_dims;
219     merged_dims[0] = spatial_dims.TotalSize();  // product of spatial dims [H*W]
220     merged_dims[1] = in.dimension(NDIMS - 2);   // input filters           [I]
221     merged_dims[2] = in.dimension(NDIMS - 1);   // output filters          [O]
222 
223     // Shuffle tensor with merged spatial dimensions.
224     Eigen::DSizes<IndexType, 3> shuffling_perm;
225     // Expand shuffled tensor into final dimensions.
226     Eigen::DSizes<IndexType, NDIMS> expanded_dims;
227 
228     if (dst_filter_format == FORMAT_OIHW) {
229       shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 1, 0);
230 
231       expanded_dims[0] = merged_dims[2];  // [O]
232       expanded_dims[1] = merged_dims[1];  // [I]
233       for (int i = 0; i < spatial_dims.rank(); ++i) {
234         expanded_dims[2 + i] = spatial_dims[i];
235       }
236 
237     } else if (dst_filter_format == FORMAT_OHWI) {
238       shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 0, 1);
239 
240       expanded_dims[0] = merged_dims[2];          // [O]
241       expanded_dims[NDIMS - 1] = merged_dims[1];  // [I]
242       for (int i = 0; i < spatial_dims.rank(); ++i) {
243         expanded_dims[1 + i] = spatial_dims[i];
244       }
245 
246     } else {
247       DCHECK(false) << "Unsupported destination filter format: "
248                     << ToString(dst_filter_format);
249     }
250 
251     out.device(d) =
252         in.reshape(merged_dims).shuffle(shuffling_perm).reshape(expanded_dims);
253   }
254 };
255 
256 // TODO This functor is not used anywhere and should be removed,
257 // but it defines some eigen templates that are referenced in other kernels.
258 template <typename Device, typename T, typename IndexType>
259 struct TransformDepth {
260   void operator()(const Device& d,
261                   typename TTypes<T, 4, IndexType>::ConstTensor in,
262                   const Eigen::DSizes<IndexType, 4>& shuffle,
263                   typename TTypes<T, 4, IndexType>::Tensor out) {
264     Eigen::DSizes<IndexType, 3> merged_dims;
265     Eigen::DSizes<IndexType, 4> expanded_dims;
266     Eigen::DSizes<IndexType, 3> new_shuffle;
267 
268     // Merge dimensions that won't be shuffled together to speed things up.
269     if (shuffle[1] == 2 && shuffle[2] == 3) {
270       merged_dims[0] = in.dimension(0);
271       merged_dims[1] = in.dimension(1);
272       merged_dims[2] = in.dimension(2) * in.dimension(3);
273       new_shuffle[0] = shuffle[0];
274       new_shuffle[1] = 2;
275       new_shuffle[2] = shuffle[3];
276       expanded_dims[0] = in.dimension(shuffle[0]);
277       expanded_dims[1] = in.dimension(2);
278       expanded_dims[2] = in.dimension(3);
279       expanded_dims[3] = in.dimension(shuffle[3]);
280     } else if (shuffle[0] == 2 && shuffle[1] == 3) {
281       merged_dims[0] = in.dimension(0);
282       merged_dims[1] = in.dimension(1);
283       merged_dims[2] = in.dimension(2) * in.dimension(3);
284       new_shuffle[0] = 2;
285       new_shuffle[1] = shuffle[2];
286       new_shuffle[2] = shuffle[3];
287       expanded_dims[0] = in.dimension(2);
288       expanded_dims[1] = in.dimension(3);
289       expanded_dims[2] = in.dimension(shuffle[2]);
290       expanded_dims[3] = in.dimension(shuffle[3]);
291     } else if (shuffle[0] == 0 && shuffle[1] == 3 && shuffle[2] == 1 &&
292                shuffle[3] == 2) {
293       merged_dims[0] = in.dimension(0);
294       merged_dims[1] = in.dimension(1) * in.dimension(2);
295       merged_dims[2] = in.dimension(3);
296       new_shuffle[0] = 0;
297       new_shuffle[1] = 2;
298       new_shuffle[2] = 1;
299       expanded_dims[0] = in.dimension(0);
300       expanded_dims[1] = in.dimension(3);
301       expanded_dims[2] = in.dimension(1);
302       expanded_dims[3] = in.dimension(2);
303     } else {
304       assert(false && "unexpected shuffle");
305     }
306 
307     out.device(d) =
308         in.reshape(merged_dims).shuffle(new_shuffle).reshape(expanded_dims);
309   }
310 };
311 
312 // Note on the use of const reference for the "padding_value" argument
313 //
// In the ROCm TF build,
// ++ the call(s) to the functor are in the files (conv_*.cc) that are compiled
//    by the "CPU" compiler, while
// ++ the GPUDevice-specific template instantiations are in the files that are
//    compiled by the "GPU" compiler.
319 //
// For T == Eigen::half, the value of the "padding_value" argument (when it was
// passed by value) was getting corrupted, leading to regressions in the
// convolution unit tests.
323 //
// I do not understand the exact reason for this, but based on similar past
// issues, it is likely due to a combination of
// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
//    Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
//    (hipclang which is based on latest clang), AND
// ++ Eigen::half having the same size but different internals on the CPU and
//    GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on
//    GPU).
//
// Changing the "padding_value" argument to be a const reference type seems to
// suppress the bug.
334 template <typename Device, typename T, typename IndexType, int NDIMS>
335 struct PadInput {
336   void operator()(const Device& d,
337                   typename TTypes<T, NDIMS, IndexType>::ConstTensor in,
338                   const std::array<int, NDIMS - 2>& padding_left,
339                   const std::array<int, NDIMS - 2>& padding_right,
340                   typename TTypes<T, NDIMS, IndexType>::Tensor out,
341                   TensorFormat format, const T& padding_value) {
342     Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
343     padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
344     for (int i = 0; i < NDIMS - 2; ++i) {
345       padding[GetTensorDimIndex<NDIMS - 2>(format, '0' + i)] = {
346           padding_left[i], padding_right[i]};
347     }
348     padding[GetTensorDimIndex<NDIMS - 2>(format, 'C')] = {0, 0};
349     out.device(d) = in.pad(padding, padding_value);
350   }
351 };
352 
353 // Converts a tensor from:
354 //   [batch, <spatial>, filters]
355 // to:
356 //   [batch, filters, <spatial>]
357 template <typename Device, typename T, int NDIMS>
358 struct NHWCToNCHW {
359   void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
360                   typename TTypes<T, NDIMS>::Tensor out);
361 };
362 
363 // Converts a tensor from:
364 //   [batch, filters, <spatial>]
365 // to:
366 //   [batch, <spatial>, filters]
367 template <typename Device, typename T, int NDIMS>
368 struct NCHWToNHWC {
369   void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
370                   typename TTypes<T, NDIMS>::Tensor out);
371 };
372 
373 // Converts a tensor from:
374 //   [dim0, dim1, dim2]
375 // to:
376 //   [dim0, dim2, dim1]
377 template <typename Device, typename T, bool conjugate = false>
378 struct SwapDimension1And2InTensor3 {
379   void operator()(const Device& d, const T* in,
380                   const gtl::ArraySlice<int64>& input_dims, T* out);
381 };
382 
383 // Converts a tensor from:
384 //   [dim0, dim1, dim2]
385 // to:
386 //   [dim2, dim1, dim0]
387 template <typename Device, typename T, bool conjugate = false>
388 struct SwapDimension0And2InTensor3 {
389   void operator()(const Device& d, const T* in,
390                   const gtl::ArraySlice<int64>& input_dims, T* out);
391 };
392 
393 // Transforms back filter from OIHW or OHWI to HWOI format to reverse effect of
394 // TransformFilter above.
395 template <typename Device, typename T, int NDIMS>
396 struct ReverseTransformFilter {
397   void operator()(const Device& d, FilterTensorFormat src_filter_format,
398                   typename TTypes<T, NDIMS>::ConstTensor in,
399                   typename TTypes<T, NDIMS>::Tensor out);
400 };
401 
402 }  // namespace functor
403 
404 template <class T>
405 class ConvAlgorithmMap;
406 
407 template <>
408 class ConvAlgorithmMap<Eigen::ThreadPoolDevice> {};
409 }  // namespace tensorflow
410 
411 #endif  // TENSORFLOW_CORE_KERNELS_CONV_2D_H_
412