/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/pooling_ops_3d.h"

#include <array>

#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/eigen_pooling.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif

#ifdef TENSORFLOW_USE_SYCL
#include "tensorflow/core/kernels/pooling_ops_3d_sycl.h"
#endif  // TENSORFLOW_USE_SYCL

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

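// Pool3dParameters validates the pooling attributes against a 5-D input
// shape and caches the per-dimension window, stride, padding, and computed
// output sizes that the kernels below share.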
Pool3dParameters::Pool3dParameters(OpKernelContext* context,
                                   const std::vector<int32>& ksize,
                                   const std::vector<int32>& stride,
                                   Padding padding, TensorFormat data_format,
                                   const TensorShape& tensor_in_shape) {
  // For 3D pooling, tensor_in must have 5 dimensions.
  OP_REQUIRES(context, tensor_in_shape.dims() == 5,
              errors::InvalidArgument("tensor_in must be 5-dimensional"));

  this->data_format = data_format;
  depth = GetTensorDim(tensor_in_shape, data_format, 'C');
  tensor_in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_planes = GetTensorDim(ksize, data_format, '0');
  window_rows = GetTensorDim(ksize, data_format, '1');
  window_cols = GetTensorDim(ksize, data_format, '2');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  plane_stride = GetTensorDim(stride, data_format, '0');
  row_stride = GetTensorDim(stride, data_format, '1');
  col_stride = GetTensorDim(stride, data_format, '2');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 3D pooling across plane/width/height. Depthwise
  // pooling is not supported.
  OP_REQUIRES(
      context, depth_window == 1 && depth_stride == 1,
      errors::Unimplemented(
          "Pooling3d only supports pooling across plane/width/height."));

  OP_REQUIRES_OK(context, GetWindowedOutputSize(tensor_in_planes, window_planes,
                                                plane_stride, padding,
                                                &out_plane, &pad_planes));
  OP_REQUIRES_OK(context,
                 GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
                                       padding, &out_height, &pad_rows));
  OP_REQUIRES_OK(context,
                 GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
                                       padding, &out_width, &pad_cols));
}

TensorShape Pool3dParameters::forward_output_shape() {
  return ShapeFromFormat(data_format, tensor_in_batch,
                         {{out_plane, out_height, out_width}}, depth);
}

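// The CPU forward passes delegate to Eigen's cuboid pooling expressions,
// which pool a 5-D tensor over its three spatial dimensions while leaving
// the batch and channel dimensions untouched.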
template <typename T>
struct LaunchPoolingOp<CPUDevice, T, AVG> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
        Eigen::CuboidAvgPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
                                window[2], stride[0], stride[1], stride[2],
                                BrainPadding2EigenPadding(padding_type));
  }
};

template <typename T>
struct LaunchPoolingOp<CPUDevice, T, MAX> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
        Eigen::CuboidMaxPooling(tensor_in.tensor<T, 5>(), window[0], window[1],
                                window[2], stride[0], stride[1], stride[2],
                                BrainPadding2EigenPadding(padding_type));
  }
};

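// Forward 3D pooling op (shared by MaxPool3D and AvgPool3D). The constructor
// validates the attributes; Compute() derives the output shape from the
// (x, y, z)-ordered window/stride arrays and dispatches to the matching
// LaunchPoolingOp specialization. For example, with NDHWC input a typical
// attribute set would be ksize = {1, 2, 2, 2, 1} and strides =
// {1, 2, 2, 2, 1}; the batch and channel entries must be 1.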
template <typename Device, typename T, PoolingType Type>
class Pooling3DOp : public UnaryOp<T> {
 public:
  explicit Pooling3DOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default Pooling3DOp only supports NDHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    const int64 depth = GetTensorDim(tensor_in, data_format_, 'C');
    const int64 in_batch = GetTensorDim(tensor_in, data_format_, 'N');

    // Dimension order for these arrays is: x, y, z.
    std::array<int64, 3> input_size{
        {GetTensorDim(tensor_in, data_format_, '2'),
         GetTensorDim(tensor_in, data_format_, '1'),
         GetTensorDim(tensor_in, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> padding, out;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));

    TensorShape out_shape = ShapeFromFormat(data_format_, in_batch,
                                            {{out[2], out[1], out[0]}}, depth);
    Tensor* output;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    LaunchPoolingOp<Device, T, Type>::launch(context, tensor_in, window, stride,
                                             padding, data_format_, padding_,
                                             output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

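// CPU gradient of 3D max pooling. For every output element, slice out the
// corresponding input window, compare it against the broadcast output
// maximum (within a 1e-5 tolerance), and route the incoming gradient to the
// positions that attained the maximum.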
template <typename T>
struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const Tensor& tensor_out, const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    output->flat<T>().setZero();
    for (int64 p = 0; p < out_backprop.dim_size(3); ++p) {
      // Calculate broadcast size for planes/rows/cols. For SAME padding,
      // current index could be in the padding area, and
      //   p * stride_planes + window_planes
      // could be beyond the input tensor's boundary. In such cases, change
      // the starting index and reduce the broadcast size.
      //
      // The same procedure is repeated for every spatial dimension in the
      // nested loops below.
      int pindex, psize;
      std::array<int64, 3> input_size{{tensor_in.dim_size(3),
                                       tensor_in.dim_size(2),
                                       tensor_in.dim_size(1)}};
      OP_REQUIRES_OK(context,
                     GetBroadcastSize(p, input_size[0], window[0], stride[0],
                                      padding[0], &pindex, &psize));
      for (int64 r = 0; r < out_backprop.dim_size(2); ++r) {
        int rindex, rsize;
        OP_REQUIRES_OK(context,
                       GetBroadcastSize(r, input_size[1], window[1], stride[1],
                                        padding[1], &rindex, &rsize));
        for (int64 c = 0; c < out_backprop.dim_size(1); ++c) {
          int cindex, csize;
          OP_REQUIRES_OK(
              context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
                                        padding[2], &cindex, &csize));
          TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
          TensorSlice dst{{0, -1},
                          {cindex, csize},
                          {rindex, rsize},
                          {pindex, psize},
                          {0, -1}};
          Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
          src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
                                     &src_sizes);
          dst.FillIndicesAndSizes<5>(tensor_in.shape(), &dst_indices,
                                     &dst_sizes);

#if !defined(EIGEN_HAS_INDEX_LIST)
          Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
#else
          Eigen::IndexList<Eigen::type2index<1>, int, int, int,
                           Eigen::type2index<1>>
              bcast;
          bcast.set(1, csize);
          bcast.set(2, rsize);
          bcast.set(3, psize);
#endif

          // Slice from tensor_in.
          Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_in_slice(dst_sizes);
          tensor_in_slice.device(context->eigen_cpu_device()) =
              tensor_in.tensor<T, 5>().slice(dst_indices, dst_sizes);

          // Slice from tensor_out.
          Eigen::Tensor<T, 5, Eigen::RowMajor> tensor_out_slice(src_sizes);
          tensor_out_slice.device(context->eigen_cpu_device()) =
              tensor_out.tensor<T, 5>().slice(src_indices, src_sizes);

          // Backprop slice.
          Eigen::Tensor<T, 5, Eigen::RowMajor> out_backprop_slice(src_sizes);
          out_backprop_slice.device(context->eigen_cpu_device()) =
              out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);

          // The true backprop slice: if an element is the max, choose
          // the backprop slice; otherwise set to 0.
          Eigen::Tensor<T, 5, Eigen::RowMajor> select_slice(dst_sizes);
          Eigen::Tensor<T, 5, Eigen::RowMajor> mat0(dst_sizes);
          mat0.setZero();
          select_slice =
              ((tensor_in_slice - tensor_out_slice.broadcast(bcast)).abs() <
               tensor_in_slice.constant(1e-5))
                  .select(out_backprop_slice.broadcast(bcast), mat0);

          output->tensor<T, 5>()
              .slice(dst_indices, dst_sizes)
              .device(context->eigen_cpu_device()) += select_slice;
        }
      }
    }
  }
};

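// Gradient op for MaxPool3D. Takes the original input, the pooled output,
// and the backpropagated gradient, and produces the gradient with respect
// to the input.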
template <class Device, class T>
class MaxPooling3dGradOp : public OpKernel {
 public:
  explicit MaxPooling3dGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Default MaxPooling3dGradOp only supports NDHWC ",
              "on device type ", DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);
    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 5,
                errors::InvalidArgument("tensor_out must be 5-dimensional"));
    OP_REQUIRES(context, out_backprop.dims() == 5,
                errors::InvalidArgument("out_backprop must be 5-dimensional"));

    const TensorShape& output_shape = tensor_in.shape();
    Tensor* input_backprop;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, output_shape, &input_backprop));
    std::array<int64, 3> input_size{
        {GetTensorDim(output_shape, data_format_, '2'),
         GetTensorDim(output_shape, data_format_, '1'),
         GetTensorDim(output_shape, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> out, padding;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));
    LaunchMaxPooling3dGradOp<Device, T>::launch(
        context, tensor_in, tensor_out, out_backprop, window, stride, out,
        padding, data_format_, input_backprop);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

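// CPU gradient of 3D average pooling. Each incoming gradient element is
// scaled by the reciprocal of its (possibly boundary-clipped) patch size and
// broadcast uniformly back over that patch.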
template <typename T>
struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context,
                     const TensorShape& tensor_in_shape,
                     const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& output_shape,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    output->flat<T>().setZero();
    std::array<int64, 3> input_size = {{tensor_in_shape.dim_size(3),
                                        tensor_in_shape.dim_size(2),
                                        tensor_in_shape.dim_size(1)}};
    for (int64 p = 0; p < out_backprop.dim_size(3); ++p) {
      // Calculate broadcast size for planes/rows/cols. For SAME padding,
      // current index could be in the padding area, and
      //   p * stride_planes + window_planes
      // could be beyond the input tensor's boundary. In such cases, change
      // the starting index and reduce the broadcast size.
      //
      // The same procedure is repeated for every spatial dimension in the
      // nested loops below.
      int pindex, psize;
      OP_REQUIRES_OK(context,
                     GetBroadcastSize(p, input_size[0], window[0], stride[0],
                                      padding[0], &pindex, &psize));
      for (int64 r = 0; r < out_backprop.dim_size(2); ++r) {
        int rindex, rsize;
        OP_REQUIRES_OK(context,
                       GetBroadcastSize(r, input_size[1], window[1], stride[1],
                                        padding[1], &rindex, &rsize));
        for (int64 c = 0; c < out_backprop.dim_size(1); ++c) {
          int cindex, csize;
          OP_REQUIRES_OK(
              context, GetBroadcastSize(c, input_size[2], window[2], stride[2],
                                        padding[2], &cindex, &csize));
          TensorSlice src{{0, -1}, {c, 1}, {r, 1}, {p, 1}, {0, -1}};
          TensorSlice dst{{0, -1},
                          {cindex, csize},
                          {rindex, rsize},
                          {pindex, psize},
                          {0, -1}};
          Eigen::DSizes<Eigen::DenseIndex, 5> src_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> src_sizes;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_indices;
          Eigen::DSizes<Eigen::DenseIndex, 5> dst_sizes;
          src.FillIndicesAndSizes<5>(out_backprop.shape(), &src_indices,
                                     &src_sizes);
          dst.FillIndicesAndSizes<5>(tensor_in_shape, &dst_indices, &dst_sizes);
#if !defined(EIGEN_HAS_INDEX_LIST)
          Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
#else
          Eigen::IndexList<Eigen::type2index<1>, int, int, int,
                           Eigen::type2index<1>>
              bcast;
          bcast.set(1, csize);
          bcast.set(2, rsize);
          bcast.set(3, psize);
#endif
          Eigen::Tensor<T, 5, Eigen::RowMajor> slices(src_sizes);
          slices.device(context->eigen_cpu_device()) =
              out_backprop.tensor<T, 5>().slice(src_indices, src_sizes);
          // Divide by the size of the actual patch (psize * rsize * csize).
          float divide_size = rsize * csize * psize * 1.0f;
          slices *= slices.constant(1.0f / divide_size);

          output->tensor<T, 5>()
              .slice(dst_indices, dst_sizes)
              .device(context->eigen_cpu_device()) += slices.broadcast(bcast);
        }
      }
    }
  }
};

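// Gradient op for AvgPool3D. Unlike the max-pool gradient, this op does not
// need the original input values: it takes only the original input shape
// (as a 1-D tensor of 5 elements) plus the backpropagated gradient.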
template <class Device, class T>
class AvgPooling3dGradOp : public OpKernel {
 public:
  explicit AvgPooling3dGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->device_type() == DEVICE_CPU) {
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument(
              "Default AvgPooling3dGradOp only supports NDHWC ",
              "on device type ", DeviceTypeString(context->device_type())));
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'N') == 1 &&
                 GetTensorDim(stride_, data_format_, 'N') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context,
                (GetTensorDim(ksize_, data_format_, 'C') == 1 &&
                 GetTensorDim(stride_, data_format_, 'C') == 1),
                errors::Unimplemented(
                    "Pooling is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in_shape = context->input(0);
    const Tensor& out_backprop = context->input(1);
    OP_REQUIRES(
        context,
        tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 5,
        errors::InvalidArgument("tensor_in must be 1-dimensional and have "
                                "5 elements"));
    OP_REQUIRES(context, out_backprop.dims() == 5,
                errors::InvalidArgument("out_backprop must be 5-dimensional"));

    TensorShape output_shape;
    auto shape_vec = tensor_in_shape.vec<int32>();
    for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
      output_shape.AddDim(shape_vec(i));
    }

    Tensor* output;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    // Dimension order for these arrays is x, y, z.
    std::array<int64, 3> input_size{
        {GetTensorDim(output_shape, data_format_, '2'),
         GetTensorDim(output_shape, data_format_, '1'),
         GetTensorDim(output_shape, data_format_, '0')}};
    std::array<int64, 3> window{{GetTensorDim(ksize_, data_format_, '2'),
                                 GetTensorDim(ksize_, data_format_, '1'),
                                 GetTensorDim(ksize_, data_format_, '0')}};
    std::array<int64, 3> stride{{GetTensorDim(stride_, data_format_, '2'),
                                 GetTensorDim(stride_, data_format_, '1'),
                                 GetTensorDim(stride_, data_format_, '0')}};
    std::array<int64, 3> padding, out;

    OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
                                            padding_, &out, &padding));

    LaunchAvgPooling3dGradOp<Device, T>::launch(
        context, output_shape, out_backprop, window, stride, out, padding,
        data_format_, output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

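// CPU second-order gradient of 3D max pooling. For each pooled output
// location, scan the corresponding input window for the first element equal
// to the pooled maximum and forward top_diff from that input index. The work
// is sharded across the batch dimension via Shard().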
template <typename T>
struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
  static void launch(OpKernelContext* context, const Pool3dParameters& params,
                     const Tensor& tensor_in, const Tensor& tensor_out,
                     const Tensor& tensor_top_diff,
                     Tensor* tensor_bottom_diff) {
    OP_REQUIRES(
        context, params.data_format == FORMAT_NHWC,
        errors::InvalidArgument("Default MaxPooling3dGradGradOp only supports ",
                                "NDHWC on CPU device type"));

    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        ConstEigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        EigenMatrixMap;

    ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                               params.tensor_in_planes * params.tensor_in_cols *
                                   params.tensor_in_rows *
                                   params.tensor_in_batch);
    ConstEigenMatrixMap out_mat(tensor_out.flat<T>().data(), params.depth,
                                params.out_plane * params.out_width *
                                    params.out_height * params.tensor_in_batch);
    ConstEigenMatrixMap top_diff_mat(
        tensor_top_diff.flat<T>().data(), params.depth,
        params.tensor_in_planes * params.tensor_in_cols *
            params.tensor_in_rows * params.tensor_in_batch);
    EigenMatrixMap bottom_diff_mat(
        tensor_bottom_diff->flat<T>().data(), params.depth,
        params.out_plane * params.out_width * params.out_height *
            params.tensor_in_batch);

    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());

    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
                     int64 start, int64 limit) {
      const int32 depth = params.depth;
      const int32 in_planes = params.tensor_in_planes;
      const int32 in_rows = params.tensor_in_rows;
      const int32 in_cols = params.tensor_in_cols;
      const int32 pad_planes = params.pad_planes;
      const int32 pad_rows = params.pad_rows;
      const int32 pad_cols = params.pad_cols;
      const int32 window_planes = params.window_planes;
      const int32 window_rows = params.window_rows;
      const int32 window_cols = params.window_cols;
      const int32 plane_stride = params.plane_stride;
      const int32 row_stride = params.row_stride;
      const int32 col_stride = params.col_stride;
      const int32 out_plane = params.out_plane;
      const int32 out_height = params.out_height;
      const int32 out_width = params.out_width;

      {
        // Initialize this shard's slice of the output backprop tensor to 0.
        const int32 output_image_size =
            out_plane * out_height * out_width * params.depth;
        EigenMatrixMap bottom_diff_shard(
            bottom_diff_mat.data() + start * output_image_size, 1,
            (limit - start) * output_image_size);
        bottom_diff_shard.setZero();
      }

      for (int b = start; b < limit; ++b) {
        for (int pp = 0; pp < out_plane; ++pp) {
          for (int ph = 0; ph < out_height; ++ph) {
            for (int pw = 0; pw < out_width; ++pw) {
              // (p_start, p_end) * (h_start, h_end) * (w_start, w_end) is the
              // range that the input vector projects to.
              int p_start = pp * plane_stride - pad_planes;
              const int p_end = std::min(p_start + window_planes, in_planes);
              int h_start = ph * row_stride - pad_rows;
              const int h_end = std::min(h_start + window_rows, in_rows);
              int w_start = pw * col_stride - pad_cols;
              const int w_end = std::min(w_start + window_cols, in_cols);
              p_start = std::max(p_start, 0);
              h_start = std::max(h_start, 0);
              w_start = std::max(w_start, 0);
              const int out_index =
                  ((b * out_plane + pp) * out_height + ph) * out_width + pw;
              // Find the value corresponding to the input maximum in top_diff.
              for (int d = 0; d < depth; ++d) {
                const T& output_ref = out_mat.coeffRef(d, out_index);
                bool should_stop = false;
                for (int p = p_start; p < p_end && !should_stop; ++p) {
                  for (int h = h_start; h < h_end && !should_stop; ++h) {
                    for (int w = w_start; w < w_end && !should_stop; ++w) {
                      const int in_index =
                          ((b * in_planes + p) * in_rows + h) * in_cols + w;
                      const T& input_ref = in_mat.coeffRef(d, in_index);
                      if (output_ref == input_ref) {
                        T& bottom_diff_ref =
                            bottom_diff_mat.coeffRef(d, out_index);
                        bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
                        should_stop = true;
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    };
    const int64 shard_cost =
        params.out_plane * params.out_height * params.out_width * params.depth *
        params.window_planes * params.window_rows * params.window_cols;
    Shard(worker_threads.num_threads, worker_threads.workers,
          params.tensor_in_batch, shard_cost, shard);
  }
};

template <class Device, class T>
class MaxPooling3dGradGradOp : public OpKernel {
 public:
  explicit MaxPooling3dGradGradOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 5,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    const int32 ksize_c = GetTensorDim(ksize_, data_format_, 'C');
    const int32 stride_c = GetTensorDim(stride_, data_format_, 'C');
    OP_REQUIRES(context, ksize_c == 1 && stride_c == 1,
                errors::Unimplemented("MaxPooling3dGradGrad is not yet "
                                      "supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_grad_backprop = context->input(2);

    // For maxpooling3d, tensor_in should have 5 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 5,
                errors::InvalidArgument("tensor_in must be 5-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 5,
                errors::InvalidArgument("tensor_out must be 5-dimensional"));
    // For maxpooling3d, out_grad_backprop should have 5 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 5,
        errors::InvalidArgument("out_grad_backprop must be 5-dimensional"));

    Pool3dParameters params{context,  ksize_,       stride_,
                            padding_, data_format_, tensor_in.shape()};

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                {2}, 0, tensor_out.shape(), &output));

    LaunchMaxPooling3dGradGradOp<Device, T>::launch(
        context, params, tensor_in, tensor_out, out_grad_backprop, output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

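// Registers the forward and gradient kernels for one (device, type) pair.
// AvgPool3DGrad pins orig_input_shape to host memory because the kernel
// reads it on the CPU to build the output shape.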
#define REGISTER_KERNELS(D, T)                                             \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("MaxPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
      Pooling3DOp<D##Device, T, MAX>);                                     \
  REGISTER_KERNEL_BUILDER(Name("MaxPool3DGrad")                            \
                              .Device(DEVICE_##D)                          \
                              .TypeConstraint<T>("T")                      \
                              .TypeConstraint<T>("TInput"),                \
                          MaxPooling3dGradOp<D##Device, T>);               \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("MaxPool3DGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
      MaxPooling3dGradGradOp<D##Device, T>);                               \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("AvgPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
      Pooling3DOp<D##Device, T, AVG>);                                     \
  REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")                            \
                              .Device(DEVICE_##D)                          \
                              .TypeConstraint<T>("T")                      \
                              .HostMemory("orig_input_shape"),             \
                          AvgPooling3dGradOp<D##Device, T>);

#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T)
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS

#if GOOGLE_CUDA

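// The GPU paths delegate to the cuDNN-backed helpers declared in
// cudnn_pooling_gpu.h; only the second-order max-pool gradient uses the
// custom CUDA kernel from pooling_ops_3d_gpu.h.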
template <typename T>
struct LaunchPoolingOp<GPUDevice, T, AVG> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
                               stride, padding, data_format, tensor_in, output);
  }
};

template <typename T>
struct LaunchPoolingOp<GPUDevice, T, MAX> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Padding padding_type,
                     Tensor* output) {
    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
                               stride, padding, data_format, tensor_in, output);
  }
};

template <typename T>
struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context, const Tensor& tensor_in,
                     const Tensor& tensor_out, const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* input_backprop) {
    const TensorShape output_shape = tensor_in.shape();
    DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
                                   window, stride, padding, out, data_format,
                                   out_backprop, output_shape, &tensor_in,
                                   &tensor_out, input_backprop);
  }
};

template <typename T>
struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context,
                     const TensorShape& tensor_in_shape,
                     const Tensor& out_backprop,
                     const std::array<int64, 3>& window,
                     const std::array<int64, 3>& stride,
                     const std::array<int64, 3>& out,
                     const std::array<int64, 3>& padding,
                     TensorFormat data_format, Tensor* output) {
    DnnPooling3dGradOp<T>::Compute(
        context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
        data_format, out_backprop, tensor_in_shape, nullptr, nullptr, output);
  }
};

template <typename T>
struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
  static void launch(OpKernelContext* context, const Pool3dParameters& params,
                     const Tensor& tensor_in, const Tensor& tensor_out,
                     const Tensor& tensor_top_diff,
                     Tensor* tensor_bottom_diff) {
    bool status = functor::MaxPool3dGradBackward<T>()(
        params.data_format, tensor_in.flat<T>().data(),
        tensor_out.flat<T>().data(), params.tensor_in_batch, params.out_plane,
        params.out_height, params.out_width, params.depth,
        params.tensor_in_planes, params.tensor_in_rows, params.tensor_in_cols,
        params.window_planes, params.window_rows, params.window_cols,
        params.plane_stride, params.row_stride, params.col_stride,
        params.pad_planes, params.pad_rows, params.pad_cols,
        tensor_top_diff.flat<T>().data(), tensor_bottom_diff->flat<T>().data(),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPool3dGradBackward"));
    }
  }
};

#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T)
TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
#undef REGISTER_GPU_KERNELS

#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)
TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
#undef REGISTER_SYCL_KERNELS
#endif  // TENSORFLOW_USE_SYCL

#undef REGISTER_KERNELS

}  // namespace tensorflow