1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #define EIGEN_USE_THREADS
17 
18 #include <algorithm>
19 #include <cmath>
20 #include <type_traits>
21 
22 #include "tensorflow/core/framework/bounds_check.h"
23 #include "tensorflow/core/framework/numeric_op.h"
24 #include "tensorflow/core/framework/op_kernel.h"
25 #include "tensorflow/core/framework/register_types.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/framework/tensor_shape.h"
28 #include "tensorflow/core/framework/tensor_types.h"
29 #include "tensorflow/core/framework/types.h"
30 #include "tensorflow/core/kernels/conv_ops.h"
31 #include "tensorflow/core/kernels/depthwise_conv_op.h"
32 #include "tensorflow/core/kernels/ops_util.h"
33 #include "tensorflow/core/lib/core/status.h"
34 #include "tensorflow/core/platform/logging.h"
35 #include "tensorflow/core/platform/types.h"
36 #include "tensorflow/core/util/padding.h"
37 #include "tensorflow/core/util/tensor_format.h"
38 #include "tensorflow/core/util/use_cudnn.h"
39 #include "tensorflow/core/util/work_sharder.h"
40 
41 #if GOOGLE_CUDA
42 #include "cuda/include/cudnn.h"
43 #include "tensorflow/core/platform/stream_executor.h"
44 #endif  // GOOGLE_CUDA
45 
46 namespace tensorflow {
47 
// In depthwise convolution, one input channel is convolved into
// depth_multiplier outputs, and the outputs don't need to be reduced again
// the way regular convolution does.
// However, the way filters are applied to inputs is exactly the same as in
// regular convolution. Please refer to the regular convolution kernels for
// more details.
54 
55 typedef Eigen::ThreadPoolDevice CPUDevice;
56 typedef Eigen::GpuDevice GPUDevice;
57 
58 // Computes the vectorized product of 'input_buffer' and 'filter' and stores
59 // result in 'output' at location specified by 'out_r' and 'out_c'.
60 //
61 // EX:
62 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
63 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
64 //
65 //   input_buffer [rows, cols, in_depth, depth_multiplier]
66 //     [a0, a0, a1, a1] [a2, a2, 0, 0] [b0, b0, b1, b1] [b2, b2, 0, 0]
67 //     [e0, e0, e1, e1] [e2, e2, 0, 0] [f0, f0, f1, f1] [f2, f2, 0, 0]
68 //
69 //   filter [rows, cols, in_depth, depth_multiplier]
70 //     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
71 //     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
72 //
73 //   First output register [in_depth, depth_multiplier]
74 //     [q0, q1, q2, q3] = ([a0, a0, a1, a1] x [u0, v0, w0, x0]) +
75 //                        ([b0, b0, b1, b1] x [u1, v1, w1, x1]) +
76 //                        ([e0, e0, e1, e1] x [u2, v2, w2, x2]) +
77 //                        ([f0, f0, f1, f1] x [u3, v3, w3, x3])
78 //
79 // TODO(andydavis) Experiment with processing multiple inputs per input buffer.
80 template <typename T>
81 struct DepthwiseConv2DKernel {
Runtensorflow::DepthwiseConv2DKernel82   static void Run(const DepthwiseArgs& args,
83                   const int64 padded_filter_inner_dim_size, const int64 out_r,
84                   const int64 out_c, const T* filter, const T* input_buffer,
85                   T* output, TensorFormat data_format) {
86     typedef typename Eigen::internal::packet_traits<T>::type Packet;
87     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
88 
89     const int64 out_depth = args.out_depth;
90     const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
91     const int64 output_scalar_size = out_depth % kPacketSize;
92     const int64 output_vectorized_size =
93         (out_depth / kPacketSize) * kPacketSize;
94     const int64 base_output_index = (out_r * args.out_cols + out_c) * out_depth;
95 
96     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
97       // Reset accumulator.
98       auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
99       for (int j = 0; j < filter_spatial_size; ++j) {
100         // Calculate index.
101         const int64 index = i + j * padded_filter_inner_dim_size;
102         // Load filter.
103         // TODO(andydavis) Unroll 'out_c' loop in caller so we can load
104         // multiple inputs here to amortize the cost of each filter block load.
105         const auto filter_block =
106             Eigen::internal::ploadu<Packet>(filter + index);
107         // Load input.
108         const auto data_block =
109             Eigen::internal::ploadu<Packet>(input_buffer + index);
110         // Vector multiply-add.
111         vaccum =
112             Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
113       }
114       // Store vector accumulator to output.
115       Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
116     }
117 
118     if (output_scalar_size > 0) {
119       auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
120       for (int j = 0; j < filter_spatial_size; ++j) {
121         const int64 index =
122             output_vectorized_size + j * padded_filter_inner_dim_size;
123         const auto filter_block =
124             Eigen::internal::ploadu<Packet>(filter + index);
125         const auto data_block =
126             Eigen::internal::ploadu<Packet>(input_buffer + index);
127         vaccum =
128             Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
129       }
130       // Load accumulator into an array and loop through output.
131       T out_buf[kPacketSize];
132       Eigen::internal::pstoreu<T>(out_buf, vaccum);
133       const int64 last_output_index =
134           base_output_index + output_vectorized_size;
135       for (int j = 0; j < output_scalar_size; ++j) {
136         output[last_output_index + j] = out_buf[j];
137       }
138     }
139   }
140 };
141 
// Computes the depthwise conv2d of 'input' by 'depthwise_filter' and stores
// the result in 'output'. This implementation accepts the cost of copying
// small patches of the input in exchange for better data alignment, which
// enables vectorized load/store and multiply-add operations (see comments at
// DepthwiseInputCopyOp and DepthwiseConv2DKernel for details).
147 //
148 // TODO(andydavis) Evaluate the performance of processing multiple input
149 // patches in the inner loop.
// TODO(andydavis) Consider a zero-copy implementation for the case when
// 'in_depth' is a multiple of register width, and 'depth_multiplier' is one.
152 // TODO(andydavis) Evaluate the performance of alternative implementations.
153 template <typename T>
154 struct LaunchDepthwiseConvOp<CPUDevice, T> {
155   typedef typename Eigen::internal::packet_traits<T>::type Packet;
156 
operator ()tensorflow::LaunchDepthwiseConvOp157   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
158                   const T* input, const T* depthwise_filter, T* output,
159                   TensorFormat data_format) {
160     OP_REQUIRES(
161         ctx, data_format == FORMAT_NHWC,
162         errors::Unimplemented(
163             "Depthwise convolution on CPU is only supported for NHWC format"));
164     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
165 
166     // Pad 'depthwise_filter' to vector register width (if needed).
167     const bool pad_filter = (args.out_depth % kPacketSize) == 0 ? false : true;
168     Tensor padded_filter;
169     if (pad_filter) {
170       // Allocate space for padded filter.
171       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
172       const int64 padded_filter_inner_dim_size =
173           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
174       OP_REQUIRES_OK(
175           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
176                                   TensorShape({filter_spatial_size,
177                                                padded_filter_inner_dim_size}),
178                                   &padded_filter));
179       // Write out padded filter.
180       functor::DepthwiseFilterPadOp<T>()(
181           args, depthwise_filter, padded_filter.template flat<T>().data());
182     }
183     const T* filter_data =
184         pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;
185 
186     // Computes one shard of depthwise conv2d output.
187     auto shard = [&ctx, &args, &input, &filter_data, &output, data_format](
188                      int64 start, int64 limit) {
189       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
190       const int64 input_image_size =
191           args.in_rows * args.in_cols * args.in_depth;
192       const int64 output_image_size =
193           args.out_rows * args.out_cols * args.out_depth;
194       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
195       const int64 padded_filter_inner_dim_size =
196           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
197 
198       // Allocate buffer for local input regions.
199       Tensor input_buffer;
200       OP_REQUIRES_OK(
201           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
202                                   TensorShape({filter_spatial_size,
203                                                padded_filter_inner_dim_size}),
204                                   &input_buffer));
205       T* input_buffer_data = input_buffer.template flat<T>().data();
206 
207       for (int64 i = start; i < limit; ++i) {
208         const int64 b = i / args.out_rows;
209         const int64 in_base = b * input_image_size;
210         const int64 out_base = b * output_image_size;
211 
212         const int64 out_r = i % args.out_rows;
213 
214         for (int64 out_c = 0; out_c < args.out_cols; ++out_c) {
215           // Populate 'input_buffer_data' with data from local input region.
216           functor::DepthwiseInputCopyOp<T>()(args, padded_filter_inner_dim_size,
217                                              out_r, out_c, input + in_base,
218                                              input_buffer_data);
219 
220           // Process buffered input across all filters and store to output.
221           DepthwiseConv2DKernel<T>::Run(
222               args, padded_filter_inner_dim_size, out_r, out_c, filter_data,
223               input_buffer_data, output + out_base, data_format);
224         }
225       }
226     };
227 
228     const int64 total_shards = args.batch * args.out_rows;
229 
230     // Empirically tested to give reasonable performance boosts at batch size 1
231     // without reducing throughput at batch size 32.
232     const float kCostMultiplier = 2.5f;
233 
234     // TODO(andydavis): Estimate shard cost (in cycles) based on the number of
235     // flops/loads/stores required to compute one shard.
236     const int64 shard_cost = kCostMultiplier * args.out_cols * args.out_depth;
237 
238     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
239     Shard(worker_threads.num_threads, worker_threads.workers, total_shards,
240           shard_cost, shard);
241   }
242 };
243 
// Explicit instantiation declarations: they tell the compiler not to
// implicitly instantiate these specializations in this translation unit.
// Extern template instantiated in conv_ops.cc.
extern template struct LaunchConv2DOp<CPUDevice, Eigen::half>;
extern template struct LaunchConv2DOp<CPUDevice, float>;
extern template struct LaunchConv2DOp<CPUDevice, double>;

#if GOOGLE_CUDA

// Extern template instantiated in conv_ops.cc.
extern template struct LaunchConv2DOp<GPUDevice, Eigen::half>;
extern template struct LaunchConv2DOp<GPUDevice, float>;
extern template struct LaunchConv2DOp<GPUDevice, double>;

// Extern template instantiated in depthwise_conv_op_gpu.cc.
extern template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA
262 
263 template <typename Device, typename T>
264 class DepthwiseConv2dNativeOp : public BinaryOp<T> {
265  public:
DepthwiseConv2dNativeOp(OpKernelConstruction * context)266   explicit DepthwiseConv2dNativeOp(OpKernelConstruction* context)
267       : BinaryOp<T>(context) {
268     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
269     string data_format;
270     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
271     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
272                 errors::InvalidArgument("Invalid data format"));
273 
274     OP_REQUIRES(context, strides_.size() == 4,
275                 errors::InvalidArgument("Sliding window strides field must "
276                                         "specify 4 dimensions"));
277     stride_ = GetTensorDim(strides_, data_format_, 'H');
278     const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
279     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
280     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
281 
282     OP_REQUIRES(context, stride_ == stride_w,
283                 errors::InvalidArgument(
284                     "Current implementation only supports equal length "
285                     "strides in the row and column dimensions."));
286     OP_REQUIRES(
287         context, (stride_n == 1 && stride_c == 1),
288         errors::InvalidArgument("Current implementation does not yet support "
289                                 "strides in the batch and depth dimensions."));
290     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
291 
292     // For in_depth == 1 and grouped convolutions.
293     use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
294     cudnn_use_autotune_ = CudnnUseAutotune();
295     use_cudnn_grouped_conv_ = false;
296     dtype_ = DataTypeToEnum<T>::value;
297   }
298 
Compute(OpKernelContext * context)299   void Compute(OpKernelContext* context) override {
300     // Input tensor is of the following dimensions:
301     // [ batch, in_rows, in_cols, in_depth ]
302     const Tensor& input = context->input(0);
303 
304     // Input filter is of the following dimensions:
305     // [ filter_rows, filter_cols, in_depth, depth_multiplier]
306     const Tensor& filter = context->input(1);
307 
308     // For 2D convolution, there should be 4 dimensions.
309     OP_REQUIRES(context, input.dims() == 4,
310                 errors::InvalidArgument("input must be 4-dimensional",
311                                         input.shape().DebugString()));
312     OP_REQUIRES(context, filter.dims() == 4,
313                 errors::InvalidArgument("filter must be 4-dimensional: ",
314                                         filter.shape().DebugString()));
315 
316     // in_depth for input and filter must match.
317     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
318     OP_REQUIRES(context, in_depth == filter.dim_size(2),
319                 errors::InvalidArgument(
320                     "input and filter must have the same depth: ", in_depth,
321                     " vs ", filter.dim_size(2)));
322 
323     // The last dimension for filter is depth multiplier.
324     const int32 depth_multiplier = filter.dim_size(3);
325 
326     // The output depth is input depth x depth multipler
327     const int32 out_depth = in_depth * depth_multiplier;
328 
329     const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
330     OP_REQUIRES(
331         context,
332         FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),
333         errors::InvalidArgument("Input rows too large"));
334     const int32 input_rows = static_cast<int32>(input_rows_raw);
335     const int32 filter_rows = filter.dim_size(0);
336 
337     const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
338     OP_REQUIRES(
339         context,
340         FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),
341         errors::InvalidArgument("Input cols too large"));
342     const int32 input_cols = static_cast<int32>(input_cols_raw);
343     const int32 filter_cols = filter.dim_size(1);
344 
345     // The first dimension for input is batch.
346     const int32 batch = input.dim_size(0);
347 
348     int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
349     OP_REQUIRES_OK(context,
350                    GetWindowedOutputSize(input_rows, filter_rows, stride_,
351                                          padding_, &out_rows, &pad_rows));
352     OP_REQUIRES_OK(context,
353                    GetWindowedOutputSize(input_cols, filter_cols, stride_,
354                                          padding_, &out_cols, &pad_cols));
355     TensorShape out_shape =
356         ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
357     OP_REQUIRES(
358         context,
359         (!std::is_same<Device, GPUDevice>::value ||
360          FastBoundsCheck(out_shape.num_elements(),
361                          std::numeric_limits<int32>::max())),
362         errors::InvalidArgument("Output elements too large for GPU kernel"));
363 
364     Tensor* output = nullptr;
365     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
366 
367     // If there is nothing to compute, return.
368     if (out_shape.num_elements() == 0) {
369       return;
370     }
371 
372     // TODO(csigg): Have autotune decide if native is faster than cuDNN.
373     // If in_depth==1, this operation is just a standard convolution.
374     // Depthwise convolution is a special case of cuDNN's grouped convolution.
375     bool use_cudnn = use_cudnn_ && (in_depth == 1 || use_cudnn_grouped_conv_);
376 
377     VLOG(2) << "DepthwiseConv2dNative: "
378             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
379             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
380             << filter_cols << ", " << in_depth << ", " << depth_multiplier
381             << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
382             << ", " << out_depth << "], stride = " << stride_
383             << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols
384             << ", Use cuDNN: " << use_cudnn;
385 
386     if (use_cudnn) {
387       // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
388       //
389       //                  | TensorFlow       | cuDNN
390       // --------------------------------------------------------------------
391       // filter_out_depth | depth_multiplier | depth_multiplier * group_count
392       // filter_in_depth  | in_depth         | in_depth / group_count
393       //
394       // For depthwise convolution, we have group_count == in_depth.
395       int32 filter_in_depth = 1;
396       TensorShape shape =
397           TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
398       Tensor reshaped_filter(/*type=*/dtype_);
399       OP_REQUIRES(
400           context, reshaped_filter.CopyFrom(filter, shape),
401           errors::Internal(
402               "Failed to reshape filter tensor for grouped convolution."));
403       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
404       // conv is supported.
405       launcher_(context, use_cudnn_, cudnn_use_autotune_, input,
406                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
407                 stride_, stride_, padding_, /*explicit_paddings=*/{}, output,
408                 data_format_);
409       return;
410     }
411 
412     DepthwiseArgs args;
413     args.batch = batch;
414     args.in_rows = input_rows;
415     args.in_cols = input_cols;
416     args.in_depth = in_depth;
417     args.filter_rows = filter_rows;
418     args.filter_cols = filter_cols;
419     args.depth_multiplier = depth_multiplier;
420     args.stride = stride_;
421     args.pad_rows = pad_rows;
422     args.pad_cols = pad_cols;
423     args.out_rows = out_rows;
424     args.out_cols = out_cols;
425     args.out_depth = out_depth;
426 
427     auto input_ptr = input.template flat<T>().data();
428     auto filter_ptr = filter.template flat<T>().data();
429     auto output_ptr = output->template flat<T>().data();
430     LaunchDepthwiseConvOp<Device, T>()(context, args, input_ptr, filter_ptr,
431                                        output_ptr, data_format_);
432   }
433 
434  protected:
435   bool use_cudnn_grouped_conv_;
436 
437  private:
438   std::vector<int32> strides_;
439   Padding padding_;
440   TensorFormat data_format_;
441 
442   int64 stride_;  // in height/width dimension.
443 
444   // For in_depth == 1 and grouped convolutions.
445   LaunchConv2DOp<Device, T> launcher_;
446   bool use_cudnn_;
447   bool cudnn_use_autotune_;
448   DataType dtype_;
449 
450   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp);
451 };
452 
// Registers DepthwiseConv2dNativeOp<CPUDevice, T> as the DEVICE_CPU kernel of
// the "DepthwiseConv2dNative" op for element type T.
#define REGISTER_CPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeOp<CPUDevice, T>)

TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
// The double kernel is skipped when both PLATFORM_WINDOWS and _DEBUG are
// defined. NOTE(review): the reason is not stated in this file.
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
463 
464 #if GOOGLE_CUDA
465 
// Registers DepthwiseConv2dNativeOp<GPUDevice, T> as the DEVICE_GPU kernel of
// the "DepthwiseConv2dNative" op for element type T.
#define REGISTER_GPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
474 
475 #if CUDNN_VERSION >= 7000
476 template <typename T>
477 class DepthwiseConv2dGroupedConvOp
478     : public DepthwiseConv2dNativeOp<GPUDevice, T> {
479  public:
DepthwiseConv2dGroupedConvOp(OpKernelConstruction * context)480   DepthwiseConv2dGroupedConvOp(OpKernelConstruction* context)
481       : DepthwiseConv2dNativeOp<GPUDevice, T>(context) {
482     this->use_cudnn_grouped_conv_ = true;
483   }
484 };
485 
// Registers DepthwiseConv2dGroupedConvOp<T> as a DEVICE_GPU kernel of the
// "DepthwiseConv2dNative" op under the "cudnn_grouped_convolution" label.
#define REGISTER_GROUPED_CONV_KERNEL(T)                            \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")            \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<T>("T")              \
                              .Label("cudnn_grouped_convolution"), \
                          DepthwiseConv2dGroupedConvOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
496 #endif  // CUDNN_VERSION
497 #endif  // GOOGLE_CUDA
498 
499 }  // namespace tensorflow
500