1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #define EIGEN_USE_THREADS
17 
18 #include <algorithm>
19 #include <cmath>
20 
21 #include "tensorflow/core/framework/bounds_check.h"
22 #include "tensorflow/core/framework/numeric_op.h"
23 #include "tensorflow/core/framework/op_kernel.h"
24 #include "tensorflow/core/framework/register_types.h"
25 #include "tensorflow/core/framework/tensor.h"
26 #include "tensorflow/core/framework/tensor_shape.h"
27 #include "tensorflow/core/framework/tensor_types.h"
28 #include "tensorflow/core/framework/types.h"
29 #include "tensorflow/core/kernels/conv_grad_ops.h"
30 #include "tensorflow/core/kernels/depthwise_conv_op.h"
31 #include "tensorflow/core/kernels/ops_util.h"
32 #include "tensorflow/core/lib/core/status.h"
33 #include "tensorflow/core/platform/logging.h"
34 #include "tensorflow/core/platform/types.h"
35 #include "tensorflow/core/util/padding.h"
36 #include "tensorflow/core/util/tensor_format.h"
37 #include "tensorflow/core/util/use_cudnn.h"
38 #include "tensorflow/core/util/work_sharder.h"
39 
40 #if GOOGLE_CUDA
41 #include "cuda/include/cudnn.h"
42 #include "tensorflow/core/platform/stream_executor.h"
43 #endif  // GOOGLE_CUDA
44 
45 namespace tensorflow {
46 
47 // Gradient operations for depthwise convolution.
48 
49 typedef Eigen::ThreadPoolDevice CPUDevice;
50 typedef Eigen::GpuDevice GPUDevice;
51 
// Common code between the two backward pass kernels: verifies that the
// dimensions all match and extracts the padded rows and columns.
//
// The macro expands to a sequence of statements, so it must be used inside a
// function body. It expects these names to be in scope at the expansion site:
//   context       - used to fetch input 2 and passed to every OP_REQUIRES*.
//   input_shape   - shape of the forward-pass input.
//   filter_shape  - shape of the depthwise filter; dims 0..3 are read as
//                   filter_rows, filter_cols, in_depth, depth_multiplier.
//   data_format_, stride_, padding_ - kernel attributes.
// 'label' is prepended to most error messages and written to the VLOG line.
//
// After a successful expansion the following locals are declared and may be
// used by the code that follows: out_backprop (input 2), batch, input_rows,
// input_cols, filter_rows, filter_cols, output_rows, output_cols, in_depth,
// depth_multiplier, out_depth, stride, out_rows, out_cols, pad_rows,
// pad_cols, and a fully populated DepthwiseArgs named 'args'.
//
// Checks performed, in order: all three shapes are 4-D; batch sizes agree;
// spatial sizes and out_depth fit in int32; filter in_depth matches the input;
// depth_multiplier * in_depth == out_depth; and the spatial size of
// out_backprop equals what GetWindowedOutputSize computes for the given
// input, filter, stride and padding.
#define EXTRACT_AND_VERIFY_DIMENSIONS(label)                                   \
  const Tensor& out_backprop = context->input(2);                              \
  OP_REQUIRES(                                                                 \
      context, input_shape.dims() == 4,                                        \
      errors::InvalidArgument(label, ": input must be 4-dimensional"));        \
  OP_REQUIRES(                                                                 \
      context, filter_shape.dims() == 4,                                       \
      errors::InvalidArgument(label, ": filter must be 4-dimensional"));       \
  OP_REQUIRES(                                                                 \
      context, out_backprop.dims() == 4,                                       \
      errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
  const int64 batch = input_shape.dim_size(0);                                 \
  OP_REQUIRES(                                                                 \
      context, batch == out_backprop.dim_size(0),                              \
      errors::InvalidArgument(                                                 \
          label, ": input and out_backprop must have the same batch size"));   \
  const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),      \
      errors::InvalidArgument("Input rows too large"));                        \
  const int32 input_rows = static_cast<int32>(input_rows_raw);                 \
  const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),      \
      errors::InvalidArgument("Input cols too large"));                        \
  const int32 input_cols = static_cast<int32>(input_cols_raw);                 \
  const int64 filter_rows = filter_shape.dim_size(0);                          \
  const int64 filter_cols = filter_shape.dim_size(1);                          \
  const int64 output_rows_raw =                                                \
      GetTensorDim(out_backprop.shape(), data_format_, 'H');                   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()),     \
      errors::InvalidArgument("Output rows too large"));                       \
  const int32 output_rows = static_cast<int32>(output_rows_raw);               \
  const int64 output_cols_raw =                                                \
      GetTensorDim(out_backprop.shape(), data_format_, 'W');                   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()),     \
      errors::InvalidArgument("Output cols too large"));                       \
  const int32 output_cols = static_cast<int32>(output_cols_raw);               \
  const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C');         \
  OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
              errors::InvalidArgument(                                         \
                  label, ": input and filter must have the same in_depth"));   \
  const int64 depth_multiplier = filter_shape.dim_size(3);                     \
  const int64 out_depth_raw =                                                  \
      GetTensorDim(out_backprop.shape(), data_format_, 'C');                   \
  OP_REQUIRES(                                                                 \
      context,                                                                 \
      FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()),       \
      errors::InvalidArgument("Output depth too large"));                      \
  const int32 out_depth = static_cast<int32>(out_depth_raw);                   \
  OP_REQUIRES(                                                                 \
      context, (depth_multiplier * in_depth) == out_depth,                     \
      errors::InvalidArgument(                                                 \
          label, ": depth_multiplier * in_depth not equal to out_depth"));     \
  const auto stride = stride_;                                                 \
  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;                \
  OP_REQUIRES_OK(context,                                                      \
                 GetWindowedOutputSize(input_rows, filter_rows, stride,        \
                                       padding_, &out_rows, &pad_rows));       \
  OP_REQUIRES_OK(context,                                                      \
                 GetWindowedOutputSize(input_cols, filter_cols, stride,        \
                                       padding_, &out_cols, &pad_cols));       \
  OP_REQUIRES(                                                                 \
      context, output_rows == out_rows,                                        \
      errors::InvalidArgument(                                                 \
          label, ": Number of rows of out_backprop doesn't match computed: ",  \
          "actual = ", output_rows, ", computed = ", out_rows));               \
  OP_REQUIRES(                                                                 \
      context, output_cols == out_cols,                                        \
      errors::InvalidArgument(                                                 \
          label, ": Number of cols of out_backprop doesn't match computed: ",  \
          "actual = ", output_cols, ", computed = ", out_cols));               \
  DepthwiseArgs args;                                                          \
  args.batch = batch;                                                          \
  args.in_rows = input_rows;                                                   \
  args.in_cols = input_cols;                                                   \
  args.in_depth = in_depth;                                                    \
  args.filter_rows = filter_rows;                                              \
  args.filter_cols = filter_cols;                                              \
  args.depth_multiplier = depth_multiplier;                                    \
  args.stride = stride;                                                        \
  args.pad_rows = pad_rows;                                                    \
  args.pad_cols = pad_cols;                                                    \
  args.out_rows = out_rows;                                                    \
  args.out_cols = out_cols;                                                    \
  args.out_depth = out_depth;                                                  \
  VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", "      \
          << input_rows << ", " << input_cols << ", " << in_depth              \
          << "]; Filter: [" << filter_rows << ", " << filter_cols << ", "      \
          << in_depth << ", " << depth_multiplier << "]; stride = " << stride  \
          << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols        \
          << ", output: [" << batch << ", " << out_rows << ", " << out_cols    \
          << ", " << out_depth << "]";
153 
154 // Copies data from local region in 'out_backprop' into 'buffer'.
155 // The local region coordinates are calculated as the set of output points which
// used the input point ('in_r', 'in_c') as input during the forward pass.
157 // Rather than spatially reversing the filter, the input is reversed during
158 // the copy. The copied data is padded to vector register-width boundaries so
159 // that it is aligned for efficient traversal and vector multiply-add by the
160 // depthwise input kernel.
161 //
162 // EX:
163 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
164 //
165 //   'out_backprop': [batch, out_rows, out_cols, out_depth]
166 //
167 //     [a00, a01, a10, a11] [a20, a21, b00, b01]
168 //     [b10, b11, b20, b21] [...]
169 //     [e00, e01, e10, e11] [e20, e21, f00, f01]
170 //     [f10, f11, f20, f21] [...]
171 //
172 //   'buffer' (register boundaries shown):
173 //
174 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
175 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
176 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
177 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
178 //
179 template <typename T>
CopyOutputBackpropRegion(const DepthwiseArgs & args,const int64 padded_filter_inner_dim_size,const int64 in_r,const int64 in_c,const T * out_backprop,T * buffer)180 static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
181                                      const int64 padded_filter_inner_dim_size,
182                                      const int64 in_r, const int64 in_c,
183                                      const T* out_backprop, T* buffer) {
184   typedef typename Eigen::internal::packet_traits<T>::type Packet;
185   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
186 
187   const int64 stride = args.stride;
188   const int64 filter_rows = args.filter_rows;
189   const int64 filter_cols = args.filter_cols;
190   const int64 pad_rows = args.pad_rows;
191   const int64 pad_cols = args.pad_cols;
192   const int64 out_rows = args.out_rows;
193   const int64 out_cols = args.out_cols;
194 
195   // Calculate the output spatial region which used point (in_r, in_c) as input.
196   const int64 out_r_start = std::max(
197       static_cast<int64>(0), (in_r - filter_rows + pad_rows + stride) / stride);
198   const int64 out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
199   const int64 out_c_start = std::max(
200       static_cast<int64>(0), (in_c - filter_cols + pad_cols + stride) / stride);
201   const int64 out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
202 
203   // Zero-pad 'buffer' if output region is smaller than filter spatial size.
204   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
205   if ((out_r_end - out_r_start + 1) < args.filter_rows ||
206       (out_c_end - out_c_start + 1) < args.filter_cols) {
207     memset(buffer, 0,
208            filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
209   }
210 
211   // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
212   const int64 vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
213   const int64 scalar_size = args.out_depth % kPacketSize;
214   const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
215 
216   for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
217     const int64 f_r = in_r + pad_rows - out_r * stride;
218     for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
219       const int64 f_c = in_c + pad_cols - out_c * stride;
220       const int64 buf_base =
221           (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
222       // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
223       auto* out_bprop =
224           out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;
225 
226       // Copy vectorized portion of inner dimension into 'buffer'.
227       for (int64 d = 0; d < vectorized_size; d += kPacketSize) {
228         auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
229         Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
230       }
231       // Copy scalar portion of out_bprop to 'buffer'
232       for (int64 d = 0; d < scalar_size; ++d) {
233         buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
234       }
235       // Pad to vector-register width (if needed).
236       for (int64 d = 0; d < pad_size; ++d) {
237         buffer[buf_base + vectorized_size + scalar_size + d] =
238             static_cast<T>(0);
239       }
240     }
241   }
242 }
243 
244 // Computes the vectorized product of 'buffer' and 'filter' and stores
245 // result in 'output' at location computed from 'in_r' and 'in_c'.
246 // If depth_multiplier is > 1, the intermediate output is reduced along
247 // the depth_multiplier dimension.
248 //
249 // EX:
250 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'buffer' and 'filter' are padded to register-width boundaries.
252 //
253 //   'buffer' [rows, cols, in_depth, depth_multiplier]
254 //
255 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
256 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
257 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
258 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
259 //
260 //   filter [rows, cols, in_depth, depth_multiplier]
261 //     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
262 //     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
263 //
264 //   First output register [in_depth, depth_multiplier]
265 //     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
266 //                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
267 //                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
268 //                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
269 //
270 //   Reduction step along depth-multiplier dimension:
271 //
272 //     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
273 //
274 
275 template <typename T>
ComputeBackpropInput(const DepthwiseArgs & args,const int64 padded_filter_inner_dim_size,const int64 in_r,const int64 in_c,const T * filter,const T * buffer,T * out_buffer,T * output)276 static void ComputeBackpropInput(const DepthwiseArgs& args,
277                                  const int64 padded_filter_inner_dim_size,
278                                  const int64 in_r, const int64 in_c,
279                                  const T* filter, const T* buffer,
280                                  T* out_buffer, T* output) {
281   typedef typename Eigen::internal::packet_traits<T>::type Packet;
282   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
283 
284   const int64 in_depth = args.in_depth;
285   const int64 depth_multiplier = args.depth_multiplier;
286   const int64 out_depth = args.out_depth;
287   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
288 
289   // Calculate vectorized and scalar lengths of 'out_depth'.
290   const int64 output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
291   const int64 output_scalar_size = out_depth % kPacketSize;
292 
293   // Calculate base index at which to begin writing output.
294   const int64 base_output_index = (in_r * args.in_cols + in_c) * in_depth;
295 
296   // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
297   // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
298   const int64 dm_vectorized_size =
299       (depth_multiplier / kPacketSize) * kPacketSize;
300   const int64 dm_scalar_size = depth_multiplier % kPacketSize;
301 
302   for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
303     // Reset accumulator.
304     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
305     for (int j = 0; j < filter_spatial_size; ++j) {
306       // Calculate index.
307       const int64 index = i + j * padded_filter_inner_dim_size;
308       // Load filter.
309       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
310       // Load input.
311       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
312       // Vector multiply-add.
313       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
314     }
315     if (depth_multiplier == 1) {
316       // Write directly to the output.
317       Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
318     } else {
319       // Buffer output for subsequent reduction step.
320       Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
321     }
322   }
323 
324   if (output_scalar_size > 0) {
325     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
326     for (int j = 0; j < filter_spatial_size; ++j) {
327       const int64 index =
328           output_vectorized_size + j * padded_filter_inner_dim_size;
329       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
330       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
331       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
332     }
333     // Load accumulator into an array and loop through output.
334     T out_buf[kPacketSize];
335     Eigen::internal::pstoreu<T>(out_buf, vaccum);
336     if (depth_multiplier == 1) {
337       // Write directly to the output.
338       for (int j = 0; j < output_scalar_size; ++j) {
339         output[base_output_index + output_vectorized_size + j] = out_buf[j];
340       }
341     } else {
342       // Buffer output for subsequent reduction step.
343       for (int j = 0; j < output_scalar_size; ++j) {
344         out_buffer[output_vectorized_size + j] = out_buf[j];
345       }
346     }
347   }
348 
349   // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
350   if (depth_multiplier > 1) {
351     for (int64 d = 0; d < in_depth; ++d) {
352       const int64 index = d * args.depth_multiplier;
353       T accum = static_cast<T>(0);
354       for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
355         const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
356         accum += Eigen::internal::predux(v);
357       }
358       // Copy scalar portion of replicated output.
359       for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
360         accum += out_buffer[index + dm_vectorized_size + dm];
361       }
362       // Copy to output.
363       output[base_output_index + d] = accum;
364     }
365   }
366 }
367 
368 // Computes the depthwise conv2d backprop input of 'out_backprop' by
369 // 'depthwise_filter' and stores the result in 'in_backprop'.
370 template <typename T>
371 struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
372   typedef typename Eigen::internal::packet_traits<T>::type Packet;
373 
operator ()tensorflow::LaunchDepthwiseConvBackpropInputOp374   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
375                   const T* out_backprop, const T* depthwise_filter,
376                   T* in_backprop, TensorFormat data_format) {
377     OP_REQUIRES(
378         ctx, data_format == FORMAT_NHWC,
379         errors::Unimplemented(
380             "Depthwise convolution on CPU is only supported for NHWC format"));
381 
382     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
383 
384     // Pad 'depthwise_filter' to vector register width (if needed).
385     const bool pad_filter = (args.out_depth % kPacketSize) == 0 ? false : true;
386     Tensor padded_filter;
387     if (pad_filter) {
388       // Allocate space for padded filter.
389       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
390       const int64 padded_filter_inner_dim_size =
391           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
392       OP_REQUIRES_OK(
393           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
394                                   TensorShape({filter_spatial_size,
395                                                padded_filter_inner_dim_size}),
396                                   &padded_filter));
397       // Write out padded filter.
398       functor::DepthwiseFilterPadOp<T>()(
399           args, depthwise_filter, padded_filter.template flat<T>().data());
400     }
401     const T* filter_data =
402         pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;
403 
404     // Computes one shard of depthwise conv2d backprop input.
405     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
406                      int64 start, int64 limit) {
407       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
408 
409       const int64 input_image_size =
410           args.in_rows * args.in_cols * args.in_depth;
411       const int64 output_image_size =
412           args.out_rows * args.out_cols * args.out_depth;
413       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
414       const int64 padded_filter_inner_dim_size =
415           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
416 
417       // Allocate buffer to copy regions from 'out_backprop'.
418       Tensor out_bprop_buffer;
419       OP_REQUIRES_OK(
420           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
421                                   TensorShape({filter_spatial_size,
422                                                padded_filter_inner_dim_size}),
423                                   &out_bprop_buffer));
424       T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();
425 
426       // Allocate buffer for intermediate results.
427       Tensor in_bprop_buffer;
428       OP_REQUIRES_OK(
429           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
430                                   TensorShape({padded_filter_inner_dim_size}),
431                                   &in_bprop_buffer));
432       T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();
433 
434       for (int64 b = start; b < limit; ++b) {
435         for (int64 in_r = 0; in_r < args.in_rows; ++in_r) {
436           for (int64 in_c = 0; in_c < args.in_cols; ++in_c) {
437             // Populate 'out_bprop_buf' from local 'out_backprop' region.
438             CopyOutputBackpropRegion<T>(
439                 args, padded_filter_inner_dim_size, in_r, in_c,
440                 out_backprop + b * output_image_size, out_bprop_buf);
441 
442             // Compute depthwise backprop input.
443             ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
444                                     in_c, filter_data, out_bprop_buf,
445                                     in_bprop_buf,
446                                     in_backprop + b * input_image_size);
447           }
448         }
449       }
450     };
451 
452     const int64 shard_cost = args.in_rows * args.in_cols * args.out_depth;
453     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
454     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
455           shard_cost, shard);
456   }
457 };
458 
459 template <typename T>
DepthwiseConvBackpropInputReference(const DepthwiseArgs & args,const T * out_backprop,const T * filter,T * in_backprop)460 static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
461                                                 const T* out_backprop,
462                                                 const T* filter,
463                                                 T* in_backprop) {
464   // Naive for loop as a reference point without concerns about performance.
465   for (int b = 0; b < args.batch; ++b) {
466     for (int in_r = 0; in_r < args.in_rows; ++in_r) {
467       for (int in_c = 0; in_c < args.in_cols; ++in_c) {
468         for (int in_d = 0; in_d < args.in_depth; ++in_d) {
469           T sum = 0;
470           const int stride = args.stride;
471           const int out_d_start = in_d * args.depth_multiplier;
472           const int out_d_end = out_d_start + args.depth_multiplier;
473 
474           for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
475             const int out_r_start = std::max(
476                 0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
477             const int out_r_end =
478                 std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);
479 
480             for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
481               const int out_c_start = std::max(
482                   0,
483                   (in_c - args.filter_cols + args.pad_cols + stride) / stride);
484               const int out_c_end =
485                   std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);
486 
487               for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
488                 int f_r = in_r + args.pad_rows - out_r * stride;
489                 int f_c = in_c + args.pad_cols - out_c * stride;
490                 int filter_dm = out_d - out_d_start;
491                 int out_backprop_offset =
492                     out_d +
493                     args.out_depth *
494                         (out_c + args.out_cols * (out_r + args.out_rows * b));
495                 int filter_offset =
496                     filter_dm +
497                     args.depth_multiplier *
498                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
499                 sum +=
500                     out_backprop[out_backprop_offset] * filter[filter_offset];
501               }
502             }
503           }
504 
505           int in_backprop_offset =
506               in_d +
507               args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
508           in_backprop[in_backprop_offset] = sum;
509         }
510       }
511     }
512   }
513 }
514 
515 // Extern template instantiated in conv_grad_input_ops.cc.
516 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
517 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
518 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;
519 
520 #if GOOGLE_CUDA
521 
522 // Extern template instantiated in conv_grad_input_ops.cc.
523 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
524 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
525 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;
526 
527 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
528 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
529                                                           Eigen::half>;
530 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
531 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
532 
533 #endif  // GOOGLE_CUDA
534 
535 // Kernel to compute the input backprop for depthwise convolution.
536 template <typename Device, class T>
537 class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
538  public:
DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction * context)539   explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
540       : OpKernel(context) {
541     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
542     OP_REQUIRES(context, strides_.size() == 4,
543                 errors::InvalidArgument("Sliding window strides field must "
544                                         "specify 4 dimensions"));
545 
546     string data_format;
547     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
548     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
549                 errors::InvalidArgument("Invalid data format"));
550 
551     stride_ = GetTensorDim(strides_, data_format_, 'H');
552     const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
553     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
554     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
555 
556     OP_REQUIRES(context, stride_ == stride_w,
557                 errors::InvalidArgument(
558                     "Current implementation only supports equal length "
559                     "strides in the row and column dimensions."));
560     OP_REQUIRES(
561         context, (stride_n == 1 && stride_c == 1),
562         errors::InvalidArgument("Current implementation does not yet support "
563                                 "strides in the batch and depth dimensions."));
564     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
565 
566     // For in_depth == 1 and grouped convolutions.
567     use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
568     cudnn_use_autotune_ = CudnnUseAutotune();
569     use_cudnn_grouped_conv_ = false;
570     dtype_ = DataTypeToEnum<T>::value;
571   }
572 
Compute(OpKernelContext * context)573   void Compute(OpKernelContext* context) override {
574     const Tensor& input_sizes = context->input(0);
575     const Tensor& filter = context->input(1);
576     OP_REQUIRES(
577         context, TensorShapeUtils::IsVector(input_sizes.shape()),
578         errors::InvalidArgument(
579             "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
580             input_sizes.dims()));
581     TensorShape input_shape;
582     const int32* in_sizes_data = input_sizes.template flat<int32>().data();
583 
584     for (int i = 0; i < input_sizes.NumElements(); ++i) {
585       OP_REQUIRES(context, in_sizes_data[i] >= 0,
586                   errors::InvalidArgument("Dimension ", i,
587                                           " of input_sizes must be >= 0"));
588       input_shape.AddDim(in_sizes_data[i]);
589     }
590     const TensorShape& filter_shape = filter.shape();
591     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
592 
593     Tensor* in_backprop = nullptr;
594     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
595                                 {0}, 0, input_shape, &in_backprop));
596 
597     // If there is nothing to compute, return.
598     if (input_shape.num_elements() == 0) {
599       return;
600     }
601 
602     // If in_depth==1, this operation is just a standard convolution.
603     // Depthwise convolution is a special case of cuDNN's grouped convolution.
604     bool use_cudnn = use_cudnn_ && (in_depth == 1 || use_cudnn_grouped_conv_);
605 
606     VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
607             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
608             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
609             << filter_cols << ", " << in_depth << ", " << depth_multiplier
610             << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
611             << ", " << out_depth << "], stride = " << stride_
612             << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols
613             << ", Use cuDNN: " << use_cudnn;
614 
615     if (use_cudnn) {
616       // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
617       //
618       //                  | TensorFlow       | cuDNN
619       // --------------------------------------------------------------------
620       // filter_out_depth | depth_multiplier | depth_multiplier * group_count
621       // filter_in_depth  | in_depth         | in_depth / group_count
622       //
623       // For depthwise convolution, we have group_count == in_depth.
624       int32 filter_in_depth = 1;
625       TensorShape shape =
626           TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
627       Tensor reshaped_filter(/*type=*/dtype_);
628       OP_REQUIRES(
629           context, reshaped_filter.CopyFrom(filter, shape),
630           errors::Internal(
631               "Failed to reshape filter tensor for grouped convolution."));
632       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
633       // conv is supported.
634       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop,
635                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
636                 stride_, stride_, padding_, /*explicit_paddings=*/{},
637                 in_backprop, data_format_);
638       return;
639     }
640 
641     auto out_backprop_ptr = out_backprop.template flat<T>().data();
642     auto filter_ptr = filter.template flat<T>().data();
643     auto in_backprop_ptr = in_backprop->template flat<T>().data();
644     LaunchDepthwiseConvBackpropInputOp<Device, T>()(
645         context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
646         data_format_);
647   }
648 
649  protected:
650   bool use_cudnn_grouped_conv_;
651 
652  private:
653   std::vector<int32> strides_;
654   Padding padding_;
655   TensorFormat data_format_;
656   int64 stride_;
657 
658   // For in_depth == 1 and grouped convolutions.
659   LaunchConv2DBackpropInputOp<Device, T> launcher_;
660   bool use_cudnn_;
661   bool cudnn_use_autotune_;
662   DataType dtype_;
663 
664   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
665 };
666 
667 #define REGISTER_CPU_KERNEL(T)                                       \
668   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
669                               .Device(DEVICE_CPU)                    \
670                               .TypeConstraint<T>("T"),               \
671                           DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
672 
673 TF_CALL_half(REGISTER_CPU_KERNEL);
674 TF_CALL_float(REGISTER_CPU_KERNEL);
675 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
676 TF_CALL_double(REGISTER_CPU_KERNEL);
677 #endif
678 #undef REGISTER_CPU_KERNEL
679 
680 #if GOOGLE_CUDA
681 
682 #define REGISTER_GPU_KERNEL(T)                                       \
683   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
684                               .Device(DEVICE_GPU)                    \
685                               .TypeConstraint<T>("T")                \
686                               .HostMemory("input_sizes"),            \
687                           DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>)
688 
689 TF_CALL_half(REGISTER_GPU_KERNEL);
690 TF_CALL_float(REGISTER_GPU_KERNEL);
691 TF_CALL_double(REGISTER_GPU_KERNEL);
692 #undef REGISTER_GPU_KERNEL
693 
694 #if CUDNN_VERSION >= 7000
695 template <typename T>
696 class DepthwiseConv2dGroupedConvBackpropInputOp
697     : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> {
698  public:
DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction * context)699   DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context)
700       : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) {
701     this->use_cudnn_grouped_conv_ = true;
702   }
703 };
704 
705 #define REGISTER_GROUPED_CONV_KERNEL(T)                              \
706   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
707                               .Device(DEVICE_GPU)                    \
708                               .TypeConstraint<T>("T")                \
709                               .HostMemory("input_sizes")             \
710                               .Label("cudnn_grouped_convolution"),   \
711                           DepthwiseConv2dGroupedConvBackpropInputOp<T>)
712 
713 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
714 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
715 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
716 #undef REGISTER_GROUPED_CONV_KERNEL
717 #endif  // CUDNN_VERSION
718 #endif  // GOOGLE_CUDA
719 
720 // Kernels to compute the gradients of the filters for depthwise convolution.
721 
722 // Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
723 // result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
724 //
725 // EX:
726 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
727 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
728 //
729 //   'input_buffer' [rows, cols, in_depth, depth_multiplier]
730 //
731 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
732 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
733 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
734 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
735 //
736 //   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
737 //
738 //     [q00, q01, q10, q11] [q20, q21, r00, r01]
739 //     [r10, r11, r20, r21] [s00, s01, s10, s11]
740 //     [s20, s21, t00, t01] [t10, t11, t20, a21]
741 //
742 //   First output register of 'filter_backprop'
743 //     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
744 //
745 template <typename T>
ComputeBackpropFilter(const DepthwiseArgs & args,const int64 padded_out_depth_size,const int64 out_r,const int64 out_c,const T * out_backprop,const T * input_buffer,T * output_buffer)746 static void ComputeBackpropFilter(const DepthwiseArgs& args,
747                                   const int64 padded_out_depth_size,
748                                   const int64 out_r, const int64 out_c,
749                                   const T* out_backprop, const T* input_buffer,
750                                   T* output_buffer) {
751   typedef typename Eigen::internal::packet_traits<T>::type Packet;
752   static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
753   // Calculate vectorized size of 'padded_out_depth_size'.
754   const int64 out_depth = args.out_depth;
755   const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
756   const int64 output_vectorized_size =
757       (padded_out_depth_size / kPacketSize) * kPacketSize;
758   const int64 base_output_index = (out_r * args.out_cols + out_c) * out_depth;
759   // Determine whether we can execute fast or slow code path.
760   const int64 output_image_size =
761       args.out_rows * args.out_cols * args.out_depth;
762   const int64 output_last_vector_index =
763       output_image_size - (filter_spatial_size * padded_out_depth_size);
764   const bool fast_path = base_output_index <= output_last_vector_index;
765 
766   if (fast_path) {
767     // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
768     // amortize the cost of 'output_buffer' load store in the loop below.
769     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
770       // Load vector register from 'out_backprop'.
771       const auto out_bprop_block =
772           Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
773       for (int j = 0; j < filter_spatial_size; ++j) {
774         const int64 index = i + j * padded_out_depth_size;
775         // Load vector register from 'input_buffer'.
776         const auto input_block =
777             Eigen::internal::ploadu<Packet>(input_buffer + index);
778         // Load output block into vector register.
779         auto out_block_data = output_buffer + index;
780         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
781         // Vector multiply-add.
782         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
783                                                    out_block);
784         // Store 'out_block' back to memory.
785         Eigen::internal::pstoreu<T>(out_block_data, out_block);
786       }
787     }
788   } else {
789     // Slow path (cant do vector reads from non-padded 'out_backprop'.
790     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
791       // Calculate safe read size from 'out_backprop'.
792       const int64 out_bprop_index = base_output_index + i;
793       const int64 out_bprop_limit =
794           std::min(output_image_size, out_bprop_index + kPacketSize);
795       T out_buf[kPacketSize];
796       memset(&out_buf, 0, kPacketSize * sizeof(T));
797       const int64 scalar_size = out_bprop_limit - out_bprop_index;
798       for (int64 j = 0; j < scalar_size; ++j) {
799         out_buf[j] = out_backprop[out_bprop_index + j];
800       }
801       // Load vector register from 'out_buf'.
802       const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
803       for (int j = 0; j < filter_spatial_size; ++j) {
804         const int64 index = i + j * padded_out_depth_size;
805         // Load vector register from 'input_buffer'.
806         const auto input_block =
807             Eigen::internal::ploadu<Packet>(input_buffer + index);
808         // Load output block into vector register.
809         auto out_block_data = output_buffer + index;
810         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
811         // Vector multiply-add.
812         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
813                                                    out_block);
814         // Store 'out_block' back to memory.
815         Eigen::internal::pstoreu<T>(out_block_data, out_block);
816       }
817     }
818   }
819 }
820 
821 template <typename Device, typename T>
822 struct LaunchDepthwiseConvBackpropFilterOp;
823 
824 template <typename T>
825 struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
826   typedef typename Eigen::internal::packet_traits<T>::type Packet;
827 
operator ()tensorflow::LaunchDepthwiseConvBackpropFilterOp828   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
829                   const T* out_backprop, const T* input, T* filter_backprop,
830                   TensorFormat data_format) {
831     OP_REQUIRES(
832         ctx, data_format == FORMAT_NHWC,
833         errors::Unimplemented(
834             "Depthwise convolution on CPU is only supported for NHWC format"));
835 
836     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
837 
838     const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
839     const int64 padded_out_depth_size =
840         ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
841 
842     // Allocate output buffers for each image in 'batch' (padded to vector
843     // register boundaries).
844     Tensor output_buffer;
845     OP_REQUIRES_OK(
846         ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
847                                 TensorShape({args.batch, filter_spatial_size,
848                                              padded_out_depth_size}),
849                                 &output_buffer));
850     T* output_buffer_data = output_buffer.template flat<T>().data();
851 
852     // Computes one shard of depthwise conv2d backprop filter.
853     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
854                      int64 start, int64 limit) {
855       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
856       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
857       const int64 padded_out_depth_size =
858           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
859 
860       // Allocate buffer for local input regions.
861       Tensor input_buffer;
862       OP_REQUIRES_OK(
863           ctx, ctx->allocate_temp(
864                    DataTypeToEnum<T>::value,
865                    TensorShape({filter_spatial_size, padded_out_depth_size}),
866                    &input_buffer));
867       T* input_buffer_data = input_buffer.template flat<T>().data();
868 
869       const int64 input_image_size =
870           args.in_rows * args.in_cols * args.in_depth;
871       const int64 output_image_size =
872           args.out_rows * args.out_cols * args.out_depth;
873       const int64 padded_filter_size =
874           filter_spatial_size * padded_out_depth_size;
875 
876       for (int b = start; b < limit; ++b) {
877         // Initialize 'output_buffer' for 'b'.
878         auto* output_buffer = output_buffer_data + b * padded_filter_size;
879         memset(output_buffer, 0, padded_filter_size * sizeof(T));
880 
881         for (int out_r = 0; out_r < args.out_rows; ++out_r) {
882           for (int out_c = 0; out_c < args.out_cols; ++out_c) {
883             // Populate 'input_buffer_data' with data from local input region.
884             functor::DepthwiseInputCopyOp<T>()(
885                 args, padded_out_depth_size, out_r, out_c,
886                 input + b * input_image_size, input_buffer_data);
887             // Compute depthwise backprop filter.
888             ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
889                                   out_backprop + b * output_image_size,
890                                   input_buffer_data, output_buffer);
891           }
892         }
893       }
894     };
895     const int64 shard_cost = args.out_rows * args.out_cols * args.out_depth;
896     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
897     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
898           shard_cost, shard);
899 
900     // Accumulate 'output_buffer' from each shard into 'output'.
901     const int64 out_depth = args.out_depth;
902     const int64 vectorized_size = (out_depth / kPacketSize) * kPacketSize;
903     const int64 scalar_size = out_depth - vectorized_size;
904     const int64 padded_filter_size =
905         filter_spatial_size * padded_out_depth_size;
906     memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));
907 
908     for (int64 i = 0; i < filter_spatial_size; ++i) {
909       const int64 buffer_base = i * padded_out_depth_size;
910       const int64 output_base = i * out_depth;
911       // Write vectorized length of filter's inner dimension to output.
912       for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
913         // Load data from 'filter_backprop' into vector register.
914         auto out_block_data = filter_backprop + output_base + j;
915         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
916         for (int b = 0; b < args.batch; ++b) {
917           // Load data from 'output_buffer' for 'b'.
918           const auto* output_buffer =
919               output_buffer_data + b * padded_filter_size;
920           const auto v =
921               Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
922           // Add 'v' to 'out_block'.
923           out_block = Eigen::internal::padd<Packet>(out_block, v);
924         }
925         // Store 'out_block' back to memory.
926         Eigen::internal::pstoreu<T>(out_block_data, out_block);
927       }
928       // Write scalar length of filter's inner dimension to output.
929       for (int64 j = 0; j < scalar_size; ++j) {
930         for (int b = 0; b < args.batch; ++b) {
931           const auto* output_buffer =
932               output_buffer_data + b * padded_filter_size;
933           filter_backprop[output_base + vectorized_size + j] +=
934               output_buffer[buffer_base + vectorized_size + j];
935         }
936       }
937     }
938   }
939 };
940 
941 template <typename T>
DepthwiseConvBackpropFilterReference(const DepthwiseArgs & args,const T * out_backprop,const T * input,T * filter_backprop)942 static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
943                                                  const T* out_backprop,
944                                                  const T* input,
945                                                  T* filter_backprop) {
946   int num_filter_backprop = args.filter_rows * args.filter_cols *
947                             args.in_depth * args.depth_multiplier;
948   memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
949   // Naive for loop as a reference point without concerns about performance.
950   for (int b = 0; b < args.batch; ++b) {
951     for (int out_r = 0; out_r < args.out_rows; ++out_r) {
952       for (int out_c = 0; out_c < args.out_cols; ++out_c) {
953         for (int out_d = 0; out_d < args.out_depth; ++out_d) {
954           const int in_d = out_d / args.depth_multiplier;
955           const int dm = out_d % args.depth_multiplier;
956           const int in_r_start = out_r * args.stride - args.pad_rows;
957           const int in_c_start = out_c * args.stride - args.pad_cols;
958 
959           for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
960             for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
961               const int in_r = in_r_start + f_r;
962               const int in_c = in_c_start + f_c;
963 
964               if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
965                   in_c < args.in_cols) {
966                 int out_backprop_offset =
967                     out_d +
968                     args.out_depth *
969                         (out_c + args.out_cols * (out_r + args.out_rows * b));
970                 int input_offset =
971                     in_d +
972                     args.in_depth *
973                         (in_c + args.in_cols * (in_r + args.in_rows * b));
974                 int filter_backprop_offset =
975                     dm +
976                     args.depth_multiplier *
977                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
978                 filter_backprop[filter_backprop_offset] +=
979                     input[input_offset] * out_backprop[out_backprop_offset];
980               }
981             }
982           }
983         }
984       }
985     }
986   }
987 }
988 
989 // Extern template instantiated in conv_grad_filter_ops.cc.
990 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>;
991 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>;
992 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>;
993 
994 #if GOOGLE_CUDA
995 
996 // Extern template instantiated in conv_grad_filter_ops.cc.
997 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
998 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
999 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;
1000 
1001 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
1002 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
1003                                                            Eigen::half>;
1004 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
1005 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
1006 
1007 #endif  // GOOGLE_CUDA
1008 
1009 // Kernel to compute the filter backprop for depthwise convolution.
1010 template <typename Device, class T>
1011 class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
1012  public:
DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction * context)1013   explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
1014       : OpKernel(context) {
1015     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
1016     OP_REQUIRES(context, strides_.size() == 4,
1017                 errors::InvalidArgument("Sliding window strides field must "
1018                                         "specify 4 dimensions"));
1019 
1020     string data_format;
1021     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1022     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1023                 errors::InvalidArgument("Invalid data format"));
1024 
1025     stride_ = GetTensorDim(strides_, data_format_, 'H');
1026     const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
1027     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
1028     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
1029 
1030     OP_REQUIRES(context, stride_ == stride_w,
1031                 errors::InvalidArgument(
1032                     "Current implementation only supports equal length "
1033                     "strides in the row and column dimensions."));
1034     OP_REQUIRES(
1035         context, (stride_n == 1 && stride_c == 1),
1036         errors::InvalidArgument("Current implementation does not yet support "
1037                                 "strides in the batch and depth dimensions."));
1038     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1039 
1040     // For in_depth == 1 and grouped convolutions.
1041     use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
1042     cudnn_use_autotune_ = CudnnUseAutotune();
1043     use_cudnn_grouped_conv_ = false;
1044 
1045     if (std::is_same<T, Eigen::half>::value) {
1046       dtype_ = DT_HALF;
1047     } else if (std::is_same<T, float>::value) {
1048       dtype_ = DT_FLOAT;
1049     } else if (std::is_same<T, double>::value) {
1050       dtype_ = DT_DOUBLE;
1051     } else {
1052       LOG(ERROR) << "Only half, float, and double are supported.";
1053     }
1054   }
1055 
Compute(OpKernelContext * context)1056   void Compute(OpKernelContext* context) override {
1057     const Tensor& input = context->input(0);
1058     const Tensor& filter_sizes = context->input(1);
1059     OP_REQUIRES(
1060         context, TensorShapeUtils::IsVector(filter_sizes.shape()),
1061         errors::InvalidArgument(
1062             "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
1063             filter_sizes.dims()));
1064     TensorShape filter_shape;
1065     const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
1066     for (int i = 0; i < filter_sizes.NumElements(); ++i) {
1067       OP_REQUIRES(context, filter_sizes_data[i] >= 0,
1068                   errors::InvalidArgument("Dimension ", i,
1069                                           " of filter_sizes must be >= 0"));
1070       filter_shape.AddDim(filter_sizes_data[i]);
1071     }
1072     const TensorShape& input_shape = input.shape();
1073 
1074     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
1075     Tensor* filter_backprop = nullptr;
1076     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1077                                 {1}, 0, filter_shape, &filter_backprop));
1078 
1079     // If there is nothing to compute, return.
1080     if (out_backprop.shape().num_elements() == 0) {
1081       return;
1082     }
1083 
1084     // If in_depth==1, this operation is just a standard convolution.
1085     // Depthwise convolution is a special case of cuDNN's grouped convolution.
1086     bool use_cudnn = use_cudnn_ && (in_depth == 1 || use_cudnn_grouped_conv_);
1087 
1088     VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
1089             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
1090             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
1091             << filter_cols << ", " << in_depth << ", " << depth_multiplier
1092             << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
1093             << ", " << out_depth << "], stride = " << stride_
1094             << ", pad_rows = " << pad_rows << ", pad_cols = " << pad_cols
1095             << ", Use cuDNN: " << use_cudnn;
1096 
1097     if (use_cudnn) {
1098       // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
1099       //
1100       //                  | TensorFlow       | cuDNN
1101       // --------------------------------------------------------------------
1102       // filter_out_depth | depth_multiplier | depth_multiplier * group_count
1103       // filter_in_depth  | in_depth         | in_depth / group_count
1104       //
1105       // For depthwise convolution, we have group_count == in_depth.
1106       int32 filter_in_depth = 1;
1107       TensorShape shape =
1108           TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
1109       Tensor reshaped_filter(/*type=*/dtype_);
1110       OP_REQUIRES(
1111           context, reshaped_filter.CopyFrom(*filter_backprop, shape),
1112           errors::Internal(
1113               "Failed to reshape filter tensor for grouped convolution."));
1114 
1115       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
1116       // conv is supported.
1117       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
1118                 /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
1119                 padding_, /*explicit_paddings=*/{}, &reshaped_filter,
1120                 data_format_);
1121       return;
1122     }
1123 
1124     auto out_backprop_ptr = out_backprop.template flat<T>().data();
1125     auto input_ptr = input.template flat<T>().data();
1126     auto filter_backprop_ptr = filter_backprop->template flat<T>().data();
1127     LaunchDepthwiseConvBackpropFilterOp<Device, T>()(
1128         context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
1129         data_format_);
1130   }
1131 
1132  protected:
1133   bool use_cudnn_grouped_conv_;
1134 
1135  private:
1136   std::vector<int32> strides_;
1137   Padding padding_;
1138   TensorFormat data_format_;
1139   int64 stride_;
1140 
1141   // For in_depth == 1 and grouped convolutions.
1142   LaunchConv2DBackpropFilterOp<Device, T> launcher_;
1143   bool use_cudnn_;
1144   bool cudnn_use_autotune_;
1145   DataType dtype_;
1146 
1147   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
1148 };
1149 
1150 #define REGISTER_CPU_KERNEL(T)                    \
1151   REGISTER_KERNEL_BUILDER(                        \
1152       Name("DepthwiseConv2dNativeBackpropFilter") \
1153           .Device(DEVICE_CPU)                     \
1154           .TypeConstraint<T>("T"),                \
1155       DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
1156 TF_CALL_half(REGISTER_CPU_KERNEL);
1157 TF_CALL_float(REGISTER_CPU_KERNEL);
1158 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
1159 TF_CALL_double(REGISTER_CPU_KERNEL);
1160 #endif
1161 #undef REGISTER_CPU_KERNEL
1162 
1163 #if GOOGLE_CUDA
1164 #define REGISTER_GPU_KERNEL(T)                                        \
1165   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
1166                               .Device(DEVICE_GPU)                     \
1167                               .TypeConstraint<T>("T")                 \
1168                               .HostMemory("filter_sizes"),            \
1169                           DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>)
1170 
1171 TF_CALL_half(REGISTER_GPU_KERNEL);
1172 TF_CALL_float(REGISTER_GPU_KERNEL);
1173 TF_CALL_double(REGISTER_GPU_KERNEL);
1174 #undef REGISTER_GPU_KERNEL
1175 
1176 #if CUDNN_VERSION >= 7000
1177 template <typename T>
1178 class DepthwiseConv2dGroupedConvBackpropFilterOp
1179     : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> {
1180  public:
DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction * context)1181   DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context)
1182       : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) {
1183     this->use_cudnn_grouped_conv_ = true;
1184   }
1185 };
1186 
1187 #define REGISTER_GROUPED_CONV_KERNEL(T)                               \
1188   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
1189                               .Device(DEVICE_GPU)                     \
1190                               .TypeConstraint<T>("T")                 \
1191                               .HostMemory("filter_sizes")             \
1192                               .Label("cudnn_grouped_convolution"),    \
1193                           DepthwiseConv2dGroupedConvBackpropFilterOp<T>)
1194 
1195 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
1196 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
1197 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
1198 #undef REGISTER_GROUPED_CONV_KERNEL
1199 #endif  // CUDNN_VERSION
1200 #endif  // GOOGLE_CUDA
1201 
1202 }  // namespace tensorflow
1203