/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

struct DepthwiseArgs {
  // Input layer dimensions
  int batch;
  int in_rows;
  int in_cols;
  int in_depth;
  int filter_rows;
  int filter_cols;
  int depth_multiplier;
  int stride;
  int pad_rows;  // Amount of padding to the top of the input
  int pad_cols;  // Amount of padding to the left of the input

  // Output layer dimensions
  int out_rows;
  int out_cols;
  int out_depth;

  DepthwiseArgs()
      : batch(0),
        in_rows(0),
        in_cols(0),
        in_depth(0),
        filter_rows(0),
        filter_cols(0),
        depth_multiplier(0),
        stride(0),
        pad_rows(0),
        pad_cols(0),
        out_rows(0),
        out_cols(0),
        out_depth(0) {}
};

// Forward declaration.
class OpKernelContext;

template <typename Device, typename T>
struct LaunchDepthwiseConvOp {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* input, const T* filter, T* output,
                  TensorFormat data_format);
};

template <typename Device, typename T>
struct LaunchDepthwiseConvBackpropInputOp {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* filter, T* in_backprop,
                  TensorFormat data_format);
};

template <typename Device, typename T>
struct LaunchDepthwiseConvBackpropFilterOp {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* input, T* filter_backprop,
                  TensorFormat data_format);
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
struct LaunchDepthwiseConvOp<Eigen::GpuDevice, T> {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* input, const T* filter, T* output,
                  TensorFormat data_format);
};

template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<Eigen::GpuDevice, T> {
  void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* filter, T* in_backprop,
                  TensorFormat data_format);
};

template <typename T>
struct LaunchDepthwiseConvBackpropFilterOp<Eigen::GpuDevice, T> {
  void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* input, T* filter_backprop,
                  TensorFormat data_format);
};
#endif

}  // namespace tensorflow
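
namespace tensorflow {

// Illustrative reference (a minimal scalar sketch, not the production path):
// the depthwise convolution that the LaunchDepthwiseConvOp functors above
// compute, assuming NHWC input ([batch, in_rows, in_cols, in_depth]), a
// filter laid out as [filter_rows, filter_cols, in_depth, depth_multiplier],
// and args.out_depth == args.in_depth * args.depth_multiplier. The function
// name 'DepthwiseConvReference' is hypothetical; it exists only to make the
// index math behind DepthwiseArgs concrete.
template <typename T>
void DepthwiseConvReference(const DepthwiseArgs& args, const T* input,
                            const T* filter, T* output) {
  for (int b = 0; b < args.batch; ++b) {
    for (int out_r = 0; out_r < args.out_rows; ++out_r) {
      for (int out_c = 0; out_c < args.out_cols; ++out_c) {
        for (int d = 0; d < args.in_depth; ++d) {
          for (int dm = 0; dm < args.depth_multiplier; ++dm) {
            const int out_d = d * args.depth_multiplier + dm;
            T sum = static_cast<T>(0);
            for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
              // Map output coordinates back to (possibly padded) input rows.
              const int in_r = out_r * args.stride - args.pad_rows + f_r;
              if (in_r < 0 || in_r >= args.in_rows) continue;
              for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
                const int in_c = out_c * args.stride - args.pad_cols + f_c;
                if (in_c < 0 || in_c >= args.in_cols) continue;
                const T in_val =
                    input[((b * args.in_rows + in_r) * args.in_cols + in_c) *
                              args.in_depth +
                          d];
                const T filter_val =
                    filter[((f_r * args.filter_cols + f_c) * args.in_depth +
                            d) *
                               args.depth_multiplier +
                           dm];
                sum += in_val * filter_val;
              }
            }
            output[((b * args.out_rows + out_r) * args.out_cols + out_c) *
                       args.out_depth +
                   out_d] = sum;
          }
        }
      }
    }
  }
}

}  // namespace tensorflow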

namespace tensorflow {
namespace functor {

// Pads 'filter' to vector-register boundary along its inner dimension:
//   filter_inner_dim_size = in_depth * depth_multiplier
//
// Requires 'filter' to have the following storage order:
//   [filter_rows, filter_cols, in_depth, depth_multiplier]
// Returns zero-padded filter in 'padded_filter'.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   So we have a total of 3 * 2 = 6 filters, each of spatial size 2 x 2.
//
//   filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, u1, v1] [w1, x1, y1, z1]
//     [u2, v2, w2, x2] [y2, z2, u3, v3] [w3, x3, y3, z3]
//
//   padded_filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
//     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]

template <typename T>
struct DepthwiseFilterPadOp {
  void operator()(const DepthwiseArgs& args, const T* filter,
                  T* padded_filter) {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    // Calculate vectorized and scalar lengths of filter's inner dimension.
    const int64 filter_inner_dim_size = args.out_depth;
    const int64 vectorized_size =
        (filter_inner_dim_size / kPacketSize) * kPacketSize;
    const int64 scalar_size = filter_inner_dim_size - vectorized_size;
    // Calculate required padding and padded output buffer stride.
    const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
    const int64 padded_filter_stride = vectorized_size + kPacketSize;

    const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    for (int64 i = 0; i < filter_spatial_size; ++i) {
      const int64 input_base = i * filter_inner_dim_size;
      const int64 output_base = i * padded_filter_stride;
      // Write vectorized length of filter's inner dimension to output.
      for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
        const auto v = Eigen::internal::ploadu<Packet>(filter + input_base + j);
        Eigen::internal::pstoreu<T>(padded_filter + output_base + j, v);
      }
      // Write scalar length of filter's inner dimension to output.
      for (int64 j = 0; j < scalar_size; ++j) {
        padded_filter[output_base + vectorized_size + j] =
            filter[input_base + vectorized_size + j];
      }
      // Pad the remainder of output to vector-register boundary.
      for (int64 j = 0; j < pad_size; ++j) {
        padded_filter[output_base + vectorized_size + scalar_size + j] =
            static_cast<T>(0);
      }
    }
  }
};
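
// Usage sketch (an assumption, not part of the functor's contract above):
// sizing the buffer that DepthwiseFilterPadOp writes into. The rounded-up
// inner dimension below mirrors the padded stride used inside the functor for
// the case where padding is actually needed (args.out_depth is not a multiple
// of the packet size). The helper name 'PaddedFilterInnerDimSize' is
// illustrative only.
template <typename T>
int64 PaddedFilterInnerDimSize(const DepthwiseArgs& args) {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  const int64 kPacketSize = static_cast<int64>(sizeof(Packet) / sizeof(T));
  // Round the filter's inner dimension (out_depth) up to a packet multiple.
  return ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
}
//
// Example call, assuming 'padded_filter' points at
//   args.filter_rows * args.filter_cols * PaddedFilterInnerDimSize<T>(args)
// elements:
//
//   DepthwiseFilterPadOp<T>()(args, filter, padded_filter);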

// Copies data from local region in 'input' specified by 'out_r' and 'out_c'
// to 'input_buffer'. The copied data is replicated by factor
// 'args.depth_multiplier', and padded to vector register-width boundaries so
// that it is aligned for efficient traversal and vector multiply-add by the
// depthwise kernel.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//
//   input: [batch, in_rows, in_cols, in_depth]
//
//     [a0, a1, a2, b0, b1, b2, ..., e0, e1, e2, f0, f1, f2, ...]
//
//   input_buffer (register boundaries shown):
//     [a0, a0, a1, a1] [a2, a2, 0, 0]   in_row = 0, in_col = 0
//     [b0, b0, b1, b1] [b2, b2, 0, 0]   in_row = 0, in_col = 1
//     [e0, e0, e1, e1] [e2, e2, 0, 0]   in_row = 1, in_col = 0
//     [f0, f0, f1, f1] [f2, f2, 0, 0]   in_row = 1, in_col = 1
//
// Returns replicated and padded data from specified input region in
// 'input_buffer'.

template <typename T>
struct DepthwiseInputCopyOp {
  void operator()(const DepthwiseArgs& args,
                  const int64 padded_filter_inner_dim_size, const int64 out_r,
                  const int64 out_c, const T* input, T* input_buffer) {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64 kPacketSize = Eigen::internal::packet_traits<T>::size;

    const int64 kDepth = args.depth_multiplier;
    // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
    const int64 input_vectorized_size =
        (args.in_depth / kPacketSize) * kPacketSize;
    const int64 input_scalar_size = args.in_depth - input_vectorized_size;

    // Calculate output padding length.
    const int64 output_scalar_size = args.out_depth % kPacketSize;
    const int64 output_pad_size =
        output_scalar_size > 0 ? kPacketSize - output_scalar_size : 0;

    // Iterate through all rows x cols reading 'in_depth' from 'input' and
    // replicating by 'depth_multiplier' into 'input_buffer' (otherwise
    // zero-padding input buffer as needed).
    auto* in_buf = input_buffer;
    const int64 in_r_start = out_r * args.stride - args.pad_rows;
    const int64 in_c_start = out_c * args.stride - args.pad_cols;

    // TODO: add a ploaddup variant for depth == 2 if needed.
    if (kDepth > 1 && kDepth <= kPacketSize) {
      for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
        const int64 in_r = in_r_start + f_r;

        for (int64 f_c = 0; f_c < args.filter_cols; ++f_c) {
          const int64 in_c = in_c_start + f_c;

          if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
              in_c < args.in_cols) {
            const auto* in =
                input + (in_r * args.in_cols + in_c) * args.in_depth;
            int64 limit = args.in_depth;
            // This will overwrite up to kPacketSize of the next elements,
            // which is ok on all iterations except the last one, since
            // correct values will be written on the next iteration.
            if (f_c == args.filter_cols - 1) {
              limit -= (kPacketSize - kDepth) / kDepth + 1;
              if (limit < 0) {
                limit = 0;
              }
            }
            // Copy vectorized portion of inner dimension.
            for (int64 d = 0; d < limit; d++) {
              const auto p = Eigen::internal::pset1<Packet>(in[d]);
              Eigen::internal::pstoreu<T>(in_buf, p);
              in_buf += kDepth;
            }

            // Copy the scalar portion.
            for (int64 d = limit; d < args.in_depth; d++) {
              const auto value = in[d];
              for (int64 dm = 0; dm < kDepth; dm++) {
                in_buf[dm] = value;
              }
              in_buf += kDepth;
            }

            // Pad the remainder of the output to vector register boundary.
            for (int64 d = 0; d < output_pad_size; ++d) {
              in_buf[d] = static_cast<T>(0);
            }
            in_buf += output_pad_size;
          } else {
            // Zero pad.
            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
            in_buf += padded_filter_inner_dim_size;
          }
        }
      }
    } else if (kDepth > kPacketSize) {
      // Calculate vectorized and scalar (residual) lengths for
      // 'depth_multiplier'. This is used to efficiently replicate data for
      // when 'depth_multiplier' > kPacketSize.
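      // Worked example (values assumed for illustration): with kDepth = 6 and
      // kPacketSize = 4, dm_vectorized_size below is 4. For each input value,
      // the vectorized loop stores replicas at offsets [0, 4) and the trailing
      // overlapping store at offset kDepth - kPacketSize = 2 covers [2, 6),
      // rewriting offsets [2, 4) with the same value, so no scalar remainder
      // loop is needed.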
      const int64 dm_vectorized_size = (kDepth / kPacketSize) * kPacketSize;

      for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
        const int64 in_r = in_r_start + f_r;

        for (int64 f_c = 0; f_c < args.filter_cols; ++f_c) {
          const int64 in_c = in_c_start + f_c;

          if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
              in_c < args.in_cols) {
            const auto* in =
                input + (in_r * args.in_cols + in_c) * args.in_depth;
            // Copy vectorized portion of inner dimension.
            for (int64 d = 0; d < args.in_depth; d++) {
              const auto p = Eigen::internal::pset1<Packet>(in[d]);
              for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
                Eigen::internal::pstoreu<T>(in_buf + dm, p);
              }
              // Overlapping store for the remainder.
              Eigen::internal::pstoreu<T>(in_buf + kDepth - kPacketSize, p);
              in_buf += kDepth;
            }
            // Pad the remainder of the output to vector register boundary.
            for (int64 d = 0; d < output_pad_size; ++d) {
              in_buf[d] = static_cast<T>(0);
            }
            in_buf += output_pad_size;
          } else {
            // Zero pad.
            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
            in_buf += padded_filter_inner_dim_size;
          }
        }
      }
    } else if (kDepth == 1) {
      for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
        const int64 in_r = in_r_start + f_r;

        for (int64 f_c = 0; f_c < args.filter_cols; ++f_c) {
          const int64 in_c = in_c_start + f_c;

          if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
              in_c < args.in_cols) {
            const auto* in =
                input + (in_r * args.in_cols + in_c) * args.in_depth;
            for (int64 d = 0; d < input_vectorized_size; d += kPacketSize) {
              const auto p = Eigen::internal::ploadu<Packet>(in + d);
              Eigen::internal::pstoreu<T>(in_buf, p);
              in_buf += kPacketSize;
            }
            for (int64 d = 0; d < input_scalar_size; ++d) {
              T v = in[input_vectorized_size + d];
              in_buf[d] = v;
            }
            in_buf += input_scalar_size;

            // Pad the remainder of the output to vector register boundary.
            for (int64 d = 0; d < output_pad_size; ++d) {
              in_buf[d] = static_cast<T>(0);
            }
            in_buf += output_pad_size;
          } else {
            // Zero pad.
            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
            in_buf += padded_filter_inner_dim_size;
          }
        }
      }
    }
  }
};

}  // namespace functor
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_