1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Implements convolution operations with image transformations (resize and
17 // mirror padding) baked into the processing, to optimize latency and memory
18 // usage.
19 
20 #define EIGEN_USE_THREADS
21 
22 #include <string>
23 #include <vector>
24 
25 #include "tensorflow/core/framework/bounds_check.h"
26 #include "tensorflow/core/framework/kernel_shape_util.h"
27 #include "tensorflow/core/framework/numeric_op.h"
28 #include "tensorflow/core/framework/op_kernel.h"
29 #include "tensorflow/core/framework/register_types.h"
30 #include "tensorflow/core/framework/resource_mgr.h"
31 #include "tensorflow/core/framework/tensor.h"
32 #include "tensorflow/core/framework/tensor_shape.h"
33 #include "tensorflow/core/framework/tensor_slice.h"
34 #include "tensorflow/core/kernels/conv_2d.h"
35 #include "tensorflow/core/kernels/conv_ops.h"
36 #include "tensorflow/core/kernels/gemm_functors.h"
37 #include "tensorflow/core/kernels/ops_util.h"
38 #include "tensorflow/core/lib/core/threadpool.h"
39 #include "tensorflow/core/util/image_resizer_state.h"
40 #include "tensorflow/core/util/mirror_pad_mode.h"
41 #include "tensorflow/core/util/padding.h"
42 #include "tensorflow/core/util/tensor_format.h"
43 
44 namespace tensorflow {
45 namespace {
46 
// Upper bound used for the im2col patch buffer. Holding every patch at once
// could need an extremely large allocation, so the patches are split into
// chunks that fit within this limit. Chunks are processed serially, which lets
// the same buffer be refilled and reused for each one and keeps the peak
// memory size down.
// From experimentation, 16 megabytes is a reasonable limit for Android and
// other platforms using Eigen, and 1MB for iOS devices.
#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
const size_t kMaxChunkSize = static_cast<size_t>(1) << 20;
#else
const size_t kMaxChunkSize = static_cast<size_t>(16) << 20;
#endif
// Size parameter for the resize cache (8 * 1024 * 1024). It is passed as the
// size argument of the cache's Im2ColBufferResource and is the limit the
// resize cache row calculations are checked against.
const size_t kResizeCacheSize = static_cast<size_t>(8) << 20;
59 
// Selects how input pixels are looked up when the image is resized.
enum SamplingMode {
  // Blend the neighboring input pixels with horizontal and vertical weights.
  BILINEAR = 0,
  // Copy a single input pixel without any blending. Has the value 1.
  NEAREST,
};
65 
66 // Simple utility function used by FusedConv to multithread basic workloads. To
67 // use it, pass begin and end values for the full workload and a std::function
68 // that receives a subset of that through the begin and end values for each
69 // worker's task. The division of the full workload into worker tasks is handled
70 // by the multithreading logic. Here's an example of how to use it:
71 // std::vector<float> my_vector(100);
72 // ...
73 // FusedConvParallelFor(context, 0, 100,
74 //   [&my_vector](int64 task_begin, int64 task_end) {
75 //     for (int64 current = task_begin; current != task_end; ++current) {
76 //       my_vector[current] *= 10.0f;
77 //     }
78 // });
FusedConvParallelFor(OpKernelContext * context,int64 begin,int64 end,const std::function<void (int64,int64)> & task_function)79 void FusedConvParallelFor(
80     OpKernelContext* context, int64 begin, int64 end,
81     const std::function<void(int64, int64)>& task_function) {
82 // On iOS, the thread management imposes a very big performance penalty, so
83 // just call the function directly with no multithreading.
84 #if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
85   task_function(begin, end);
86 #else
87   auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
88   thread::ThreadPool* thread_pool = worker_threads.workers;
89   const int64 total_elements = end - begin;
90   // This is a bit of an arbitrary number, but was found to work well for
91   // typical models we've been profiling on various devices.
92   const int64 element_cost = 10000000;
93   thread_pool->ParallelFor(
94       total_elements, element_cost,
95       [begin, task_function](int64 begin_offset, int64 end_offset) {
96         const int64 task_begin = begin + begin_offset;
97         const int64 task_end = begin + end_offset;
98         task_function(task_begin, task_end);
99       });
100 #endif
101 }
102 
103 // Holds the state needed for the resizing subtasks.
104 template <class T1>
105 struct ResizeTaskParameters {
ResizeTaskParameterstensorflow::__anon794cca7e0111::ResizeTaskParameters106   ResizeTaskParameters() : st(false, false) {}
107 
108   int cache_height;
109   T1* resize_cache;
110   int cache_line_width;
111   int input_width;
112   int input_depth;
113   int top_padding;
114   int pad_offset;
115   int64 resized_height;
116   ImageResizerState st;
117   const T1* input_batch_start;
118   int64 cache_start_x;
119   int64 cache_end_x;
120   int left_padding;
121   int64 resized_width;
122   int64 padded_width;
123   int64 padded_height;
124 };
125 
// Values that stay constant while a single row of the resize cache is being
// filled in.
//
// The struct relies on the implicitly generated special members: the previous
// hand-written copy constructor only duplicated the member-wise copy, and the
// previous empty default constructor left every member indeterminate. Members
// now start out null / zero.
template <class T1>
struct PerCacheLineParameters {
  // Start of the destination row inside the resize cache.
  T1* cache_line_start = nullptr;
  // Start of the upper input row that the cache row is sampled from.
  const T1* input_top_row_start = nullptr;
  // Start of the lower input row that the cache row is sampled from.
  const T1* input_bottom_row_start = nullptr;
  // Vertical blend weight between the upper and lower input rows.
  T1 y_lerp = T1(0);
};
140 
// Helper that makes bilinear filtering easier to express. It stores pointers
// to the four input pixels surrounding a sample point and blends them one
// channel at a time.
template <class T1>
struct SampleRect {
  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
                                 const T1* in_bottom_left,
                                 const T1* in_bottom_right)
      : top_left(in_top_left),
        top_right(in_top_right),
        bottom_left(in_bottom_left),
        bottom_right(in_bottom_right) {}

  // Returns the blended value of `channel`: the two upper pixels and the two
  // lower pixels are each mixed horizontally by x_lerp, and the two results
  // are then mixed vertically by y_lerp.
  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
                                        T1 y_lerp) const {
    const T1 upper = Lerp(top_left[channel], top_right[channel], x_lerp);
    const T1 lower = Lerp(bottom_left[channel], bottom_right[channel], x_lerp);
    return Lerp(upper, lower, y_lerp);
  }

  const T1* top_left;
  const T1* top_right;
  const T1* bottom_left;
  const T1* bottom_right;

 private:
  // Linear interpolation from `from` (weight 0) towards `to` (weight 1).
  static EIGEN_ALWAYS_INLINE T1 Lerp(T1 from, T1 to, T1 weight) {
    return from + (to - from) * weight;
  }
};
166 
167 // Calculates parameters which remain constant through a resize cache row.
168 template <class T1>
CalculatePerCacheLineParameters(int64 cache_height,int64 cache_y,T1 * resize_cache,int64 cache_line_width,int64 input_width,int64 input_depth,int64 top_padding,int64 pad_offset,int64 resized_height,const ImageResizerState & st,const T1 * input_batch_start)169 EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
170     int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width,
171     int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset,
172     int64 resized_height, const ImageResizerState& st,
173     const T1* input_batch_start) {
174   PerCacheLineParameters<T1> result;
175   // The cache is organized so that the real y values of the resized image map
176   // onto the actual cache values through a modulo scheme. This means that as we
177   // progress downwards through the image, we keep reusing a small cache and so
178   // keep memory usage down.
179   int64 cache_index_y;
180   if (cache_y < 0) {
181     cache_index_y = cache_height + (cache_y % cache_height);
182   } else {
183     cache_index_y = cache_y % cache_height;
184   }
185   result.cache_line_start =
186       resize_cache + (cache_index_y * cache_line_width * input_depth);
187   // This part is implementing the mirror padding that happens before resizing.
188   float in_y = (cache_y - top_padding);
189   if (in_y < 0) {
190     in_y = -(in_y + 1.0f - pad_offset);
191   } else if (in_y >= resized_height) {
192     in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
193   }
194   // Here's where to do the actual resize.
195   in_y *= st.height_scale;
196   const int64 top_y_index = static_cast<int64>(std::floor(in_y));
197   const int64 bottom_y_index =
198       std::min(static_cast<int64>(std::ceil(in_y)), (st.in_height - 1));
199   // Lerp is used for bilinear filtering when that's needed.
200   result.y_lerp = static_cast<T1>(in_y - top_y_index);
201   // Which rows of the original input image to pull the values from.
202   result.input_top_row_start =
203       input_batch_start + (top_y_index * input_width * input_depth);
204   result.input_bottom_row_start =
205       input_batch_start + (bottom_y_index * input_width * input_depth);
206   return result;
207 }
208 
// Values that are computed once for each pixel of a resize cache row.
//
// The struct relies on the implicitly generated special members: the previous
// hand-written copy constructor only duplicated the member-wise copy, and the
// previous empty default constructor left every member indeterminate. Members
// now start out null / zero.
template <class T1>
struct PerCachePixelParameters {
  // Destination of the pixel inside its resize cache row.
  T1* cache_line_pixel = nullptr;
  // x index of the left input pixel to sample from.
  int64 left_x_index = 0;
  // x index of the right input pixel to sample from.
  int64 right_x_index = 0;
  // Horizontal blend weight between the left and right input pixels.
  T1 x_lerp = T1(0);
};
223 
224 // Pulls out common parameters used for every resized pixel.
225 template <class T1>
226 EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
CalculatePerCachePixelParameters(int64 cache_x,int64 cache_start_x,T1 * cache_line_start,int64 input_depth,int64 left_padding,int64 pad_offset,int64 resized_width,const ImageResizerState & st)227 CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x,
228                                  T1* cache_line_start, int64 input_depth,
229                                  int64 left_padding, int64 pad_offset,
230                                  int64 resized_width,
231                                  const ImageResizerState& st) {
232   PerCachePixelParameters<T1> result;
233   // Figure out where we're going to store the results of our transform.
234   const int cache_index_x = cache_x - cache_start_x;
235   result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
236   // Implement mirror padding by flipping in_x if it's off the edge.
237   float in_x = (cache_x - left_padding);
238   if (in_x < 0) {
239     in_x = -(in_x + 1.0f - pad_offset);
240   } else if (in_x >= resized_width) {
241     in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
242   }
243   // Resize the x parameters.
244   in_x *= st.width_scale;
245   // Get the x coordinates for the left and right pixels to pull from.
246   result.left_x_index = static_cast<int64>(std::floor(in_x));
247   result.right_x_index =
248       std::min(static_cast<int64>(std::ceil(in_x)), (st.in_width - 1));
249   // This x_lerp is used to blend pixels in bilinear filtering.
250   result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
251   return result;
252 }
253 
254 // Combines bilinear resizing and mirror padding into the im2col transformation
255 // stage of convolution.
256 template <class T1, class T2, class T3, class TGemmFunctor,
257           SamplingMode SampleMode>
258 class FusedResizeAndPadConvFunctor {
259  public:
operator ()(OpKernelContext * context,const Tensor & input,int input_batches,int resized_height,int resized_width,int padded_height,int padded_width,int input_depth,const T2 * filter_data,int filter_height,int filter_width,int filter_count,int stride_rows,int stride_cols,Padding padding,T3 * output_data,int output_height,int output_width,const ImageResizerState & st,int top_padding,int bottom_padding,int left_padding,int right_padding,int pad_offset)260   void operator()(OpKernelContext* context, const Tensor& input,
261                   int input_batches, int resized_height, int resized_width,
262                   int padded_height, int padded_width, int input_depth,
263                   const T2* filter_data, int filter_height, int filter_width,
264                   int filter_count, int stride_rows, int stride_cols,
265                   Padding padding, T3* output_data, int output_height,
266                   int output_width, const ImageResizerState& st,
267                   int top_padding, int bottom_padding, int left_padding,
268                   int right_padding, int pad_offset) {
269     if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
270         (input_depth <= 0)) {
271       LOG(WARNING) << "Conv2D was called with bad input dimensions: "
272                    << input_batches << ", " << padded_height << ", "
273                    << padded_width << ", " << input_depth;
274       return;
275     }
276     if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
277       LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
278                    << filter_width << ", " << filter_height << ", "
279                    << filter_count;
280       return;
281     }
282     if ((output_width <= 0) || (output_height <= 0)) {
283       LOG(WARNING) << "Conv2D was called with bad output width or height: "
284                    << output_width << ", " << output_height;
285       return;
286     }
287     OP_REQUIRES(
288         context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
289         errors::InvalidArgument("Bad sample mode passed in", SampleMode));
290 
291     // These calculations define how the patches will be positioned within the
292     // input image. The actual definitions are quite complex, and rely on the
293     // previously-calculated output size.
294     int filter_left_offset;
295     int filter_top_offset;
296     if (padding == VALID) {
297       filter_left_offset =
298           ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
299           2;
300       filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
301                            padded_height + 1) /
302                           2;
303     } else {
304       filter_left_offset =
305           ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
306       filter_top_offset =
307           ((output_height - 1) * stride_rows + filter_height - padded_height) /
308           2;
309     }
310 
311     ResizeTaskParameters<T1> task_params;
312     task_params.input_depth = input_depth;
313     task_params.top_padding = top_padding;
314     task_params.pad_offset = pad_offset;
315     task_params.resized_height = resized_height;
316     task_params.st = st;
317     task_params.left_padding = left_padding;
318     task_params.resized_width = resized_width;
319     task_params.padded_width = padded_width;
320     task_params.padded_height = padded_height;
321 
322     // The im2col buffer has # of patches rows, and # of filters cols.
323     // It's laid out like this, in row major order in memory:
324     //        < filter value count >
325     //   ^   +---------------------+
326     // patch |                     |
327     // count |                     |
328     //   v   +---------------------+
329     // Each patch row contains a filter_width x filter_height patch of the
330     // input, with the depth channel as the most contiguous in memory, followed
331     // by the width, then the height. This is the standard memory order in the
332     // image world if it helps to visualize it.
333     const int filter_value_count = filter_width * filter_height * input_depth;
334 
335     OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
336                 errors::InvalidArgument("Im2Col patch too large for buffer"));
337     const size_t patches_per_chunk =
338         kMaxChunkSize / (filter_value_count * sizeof(T1));
339     // Because memory allocation is very expensive on mobile platforms, try to
340     // allocate a persistent buffer that will be kept around between calls. We
341     // use TensorFlow's resource management to ensure that the memory will be
342     // released when the session is over.
343     Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
344     std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
345         [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
346           *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
347           return Status::OK();
348         };
349     OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
350                                 "Conv2d", "im2col_buffer",
351                                 &im2col_buffer_resource, creator));
352 
353     // Create a resize cache memory buffer that will hold the rows of
354     // transformed and mirror padded input pixels, ready to be copied
355     // into filter patches by im2col.
356     // It's laid out like this, in row major order in memory:
357     //         < cache line width >
358     //   ^    +--------------------+
359     // cache  |                    |
360     // height |                    |
361     //   v    +--------------------+
362     // Each cache row contains a cache_line_width number of resized pixels,
363     // each with input_depth channels. The cache height is typically less than
364     // the full height the resized image would be, so it's filled up
365     // incrementally as we progress downwards through the input creating im2col
366     // patches.
367     task_params.cache_start_x = -filter_left_offset;
368     task_params.cache_end_x =
369         (((output_width - 1) * stride_cols) - filter_left_offset) +
370         filter_width;
371     task_params.cache_line_width =
372         task_params.cache_end_x - task_params.cache_start_x;
373     task_params.cache_height =
374         kResizeCacheSize / (task_params.cache_line_width * input_depth);
375     const int needed_resize_cache_count =
376         filter_height * task_params.cache_line_width * input_depth;
377     OP_REQUIRES(context,
378                 (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
379                 errors::InvalidArgument("Input too large for resize cache"));
380     Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
381     std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
382         resize_creator =
383             [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
384               *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
385               return Status::OK();
386             };
387     OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
388                                 "Conv2d", "resize_cache",
389                                 &resize_cache_resource, resize_creator));
390 
391     // This means that multiple ops can't be run simultaneously on different
392     // threads, because we have a single shared resource. The platforms this is
393     // aimed at have intra-op parallelism as their focus though, so it shouldn't
394     // be an issue.
395     mutex_lock lock_buffer(im2col_buffer_resource->mu);
396     core::ScopedUnref unref_buffer(im2col_buffer_resource);
397     T1* im2col_buffer = im2col_buffer_resource->data;
398 
399     // This buffer is used as a fairly heavy-weight cache for the resized and
400     // mirrored inputs to the im2col operation. The problem is that we want to
401     // keep the memory usage down by not rendering the fully resized and padded
402     // input tensor to the convolution into an entire buffer. The first approach
403     // to avoid this was to fold the bilinear filtering and padding spatial
404     // transformations into the im2col lookup itself. This successfully reduced
405     // memory usage, but because im2col can access an individual pixel for many
406     // different patches, the extra overhead of doing the same bilinear lookups
407     // repeatedly became too expensive.
408     // The resize cache is designed to avoid this problem by keeping a
409     // horizontal slice of the resized and padded input to the im2col
410     // precalculated, so that repeated accesses to the same pixel from different
411     // filter patches can just be copied from this cache. It's organized as a
412     // horizontal slice stretching across the whole virtual image, and as high
413     // as the filter window, so that as the patch processing moves across all
414     // the pixels are present, and before a new row of patches is started any
415     // previously calculated rows that are needed are maintained, with new rows
416     // calculated as required.
417     mutex_lock resize_lock_buffer(resize_cache_resource->mu);
418     core::ScopedUnref unref_resized_cache(resize_cache_resource);
419     task_params.resize_cache = resize_cache_resource->data;
420 
421     const T1* input_data = input.flat<T1>().data();
422     const int64 input_height = input.shape().dim_sizes()[1];
423     task_params.input_width = input.shape().dim_sizes()[2];
424 
425     int end_cached_lines = std::numeric_limits<int>::min();
426 
427     for (int batch = 0; batch < input_batches; ++batch) {
428       task_params.input_batch_start =
429           input_data +
430           (batch * input_height * task_params.input_width * input_depth);
431       const int in_y_end =
432           ((output_height * stride_rows) - filter_top_offset) + filter_height;
433       for (int out_y = 0; out_y < output_height; ++out_y) {
434         const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
435         const int cache_start_y = std::max(in_y_origin, end_cached_lines);
436         const int cache_end_y = std::min(
437             in_y_end, std::max((in_y_origin + task_params.cache_height),
438                                end_cached_lines));
439         if (end_cached_lines < (in_y_origin + filter_height)) {
440           // This call breaks up the work required for calculating the mirror
441           // padding and resizing across multiple threads.
442           FusedConvParallelFor(
443               context, cache_start_y, cache_end_y,
444               [task_params](int64 task_cache_start_y, int64 task_cache_end_y) {
445                 // This is a long and confusing function, but it's been laid out
446                 // this way to help with performance on some intensive models.
447                 // What it's doing is populating a cache of the original input
448                 // image, after it's been bilinear resized and had its edges
449                 // mirrored. This allows the following im2col code to access the
450                 // transformed pixels from this cache, without having to
451                 // repeatedly apply the expensive bilinear calculations as the
452                 // same pixels are accessed by different patches.
453                 // This is most effective when the stride is small and the
454                 // filter size is large, since that's when pixels are reused
455                 // most frequently as patches overlap.
456                 for (int cache_y = task_cache_start_y;
457                      cache_y < task_cache_end_y; ++cache_y) {
458                   // We organize the cache as a series of rows, each containing
459                   // all the transformed pixels for a given line in the image.
460                   // This cache is big enough to hold at least a filter's height
461                   // worth of rows, but typically more, limited by the size of
462                   // the cache buffer.
463                   // We don't allocate an entire image's worth of rows though,
464                   // because we're trying to keep memory usage down, so as we
465                   // progress downwards through the im2col we periodically
466                   // refresh the cache so that the next lines that are needed
467                   // for that operation are always present.
468                   // Work out the parameters that remain constant across the
469                   // row we're calculating.
470                   PerCacheLineParameters<T1> line_params(
471                       CalculatePerCacheLineParameters<T1>(
472                           task_params.cache_height, cache_y,
473                           task_params.resize_cache,
474                           task_params.cache_line_width, task_params.input_width,
475                           task_params.input_depth, task_params.top_padding,
476                           task_params.pad_offset, task_params.resized_height,
477                           task_params.st, task_params.input_batch_start));
478                   // Iterate through the resize cache row we're filling in.
479                   for (int cache_x = task_params.cache_start_x;
480                        cache_x < task_params.cache_end_x; ++cache_x) {
481                     // Figure out what we need for the cache pixel we're
482                     // populating.
483                     PerCachePixelParameters<T1> pixel_params(
484                         CalculatePerCachePixelParameters<T1>(
485                             cache_x, task_params.cache_start_x,
486                             line_params.cache_line_start,
487                             task_params.input_depth, task_params.left_padding,
488                             task_params.pad_offset, task_params.resized_width,
489                             task_params.st));
490                     // If the access is off the left, right, top, or bottom of
491                     // the resized image, the conv padding means we should set
492                     // it to zero.
493                     if ((cache_x < 0) ||
494                         (cache_x >= task_params.padded_width) ||
495                         (cache_y < 0) ||
496                         (cache_y >= task_params.padded_height)) {
497                       std::fill_n(pixel_params.cache_line_pixel,
498                                   task_params.input_depth, T1(0));
499                     } else {
500                       // There are two different sampling strategies for
501                       // resizing. When using nearest, we can just do a
502                       // straight copy of the pixel closest to our sample point,
503                       // but bilinear requires a more complex calculation.
504                       if (SampleMode == NEAREST) {
505                         const T1* input_top_left_pixel =
506                             line_params.input_top_row_start +
507                             (pixel_params.left_x_index *
508                              task_params.input_depth);
509 
510                         std::copy_n(input_top_left_pixel,
511                                     task_params.input_depth,
512                                     pixel_params.cache_line_pixel);
513                       } else {
514                         const SampleRect<T1> rect(
515                             line_params.input_top_row_start +
516                                 (pixel_params.left_x_index *
517                                  task_params.input_depth),
518                             line_params.input_top_row_start +
519                                 (pixel_params.right_x_index *
520                                  task_params.input_depth),
521                             line_params.input_bottom_row_start +
522                                 (pixel_params.left_x_index *
523                                  task_params.input_depth),
524                             line_params.input_bottom_row_start +
525                                 (pixel_params.right_x_index *
526                                  task_params.input_depth));
527                         for (int in_channel = 0;
528                              in_channel < task_params.input_depth;
529                              ++in_channel) {
530                           pixel_params.cache_line_pixel[in_channel] =
531                               rect.BilinearSample(in_channel,
532                                                   pixel_params.x_lerp,
533                                                   line_params.y_lerp);
534                         }
535                       }
536                     }
537                   }
538                 }
539               });
540           end_cached_lines = cache_end_y;
541         }
542         for (int out_x = 0; out_x < output_width; ++out_x) {
543           const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
544           const int patch_index = (batch * output_width * output_height) +
545                                   (out_y * output_width) + out_x;
546           const int patch_index_within_chunk = patch_index % patches_per_chunk;
547           T1* im2col_patch_start =
548               im2col_buffer + (patch_index_within_chunk * filter_value_count);
549           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
550             T1* im2col_row_start =
551                 im2col_patch_start +
552                 (filter_y * filter_width * task_params.input_depth);
553             const int conv_in_y = in_y_origin + filter_y;
554             int cache_index_y;
555             if (conv_in_y < 0) {
556               cache_index_y = task_params.cache_height +
557                               (conv_in_y % task_params.cache_height);
558             } else {
559               cache_index_y = conv_in_y % task_params.cache_height;
560             }
561             T1* cache_line_start =
562                 task_params.resize_cache +
563                 (cache_index_y * task_params.cache_line_width *
564                  task_params.input_depth);
565             T1* cache_filter_row_start =
566                 cache_line_start + ((in_x_origin - task_params.cache_start_x) *
567                                     task_params.input_depth);
568             std::copy_n(cache_filter_row_start,
569                         (filter_width * task_params.input_depth),
570                         im2col_row_start);
571           }
572           const bool is_last_in_chunk =
573               (patch_index_within_chunk == (patches_per_chunk - 1));
574           const bool is_last_overall =
575               ((batch == (input_batches - 1)) &&
576                (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
577           if (is_last_in_chunk || is_last_overall) {
578             // Now we've assembled a set of image patches into a matrix, apply
579             // a GEMM matrix multiply of the patches as rows, times the filter
580             // weights in columns, to get partial results in the output
581             // matrix.
582             const int how_many_patches = patch_index_within_chunk + 1;
583             const int m = how_many_patches;
584             const int n = filter_count;
585             const int k = filter_value_count;
586             const int lda = filter_value_count;
587             const int ldb = filter_count;
588             const int ldc = filter_count;
589             const size_t start_patch_index =
590                 patch_index - (how_many_patches - 1);
591             T3* chunk_output_data =
592                 output_data + (start_patch_index * filter_count);
593             TGemmFunctor gemm_functor;
594             gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
595                          chunk_output_data, ldc);
596           }
597         }
598       }
599     }
600   }
601 };
602 
603 }  // namespace
604 
605 // Implements a version of convolution with bilinear resizing and mirror padding
606 // included.
607 template <class T, class TConvFunctor, bool DoResize>
608 class FusedResizeConv2DUsingGemmOp : public OpKernel {
609  public:
FusedResizeConv2DUsingGemmOp(OpKernelConstruction * context)610   explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
611       : OpKernel(context) {
612     if (DoResize) {
613       OP_REQUIRES_OK(context,
614                      context->GetAttr("resize_align_corners", &align_corners_));
615     }
616     MirrorPadMode mode;
617     OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));
618 
619     switch (mode) {
620       case MirrorPadMode::SYMMETRIC: {
621         offset_ = 0;
622         break;
623       }
624       case MirrorPadMode::REFLECT: {
625         offset_ = 1;
626         break;
627       }
628       default:
629         OP_REQUIRES(context, false,
630                     errors::InvalidArgument(
631                         "mode must be either REFLECT or SYMMETRIC."));
632     }
633     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
634     OP_REQUIRES(context, strides_.size() == 4,
635                 errors::InvalidArgument("Sliding window strides field must "
636                                         "specify 4 dimensions"));
637     const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
638     const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
639     OP_REQUIRES(
640         context, stride_n == 1 && stride_c == 1,
641         errors::InvalidArgument("Current implementation does not yet support "
642                                 "strides in the batch and depth dimensions."));
643     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
644   }
645 
Compute(OpKernelContext * context)646   void Compute(OpKernelContext* context) override {
647     // Input tensor is of the following dimensions:
648     // [ batch, in_rows, in_cols, in_depth ]
649     const Tensor& input = context->input(0);
650     OP_REQUIRES(context, (input.shape().num_elements() > 0),
651                 errors::InvalidArgument("Input tensor can't be empty"));
652 
653     ImageResizerState st(false, false);
654     if (DoResize) {
655       st = ImageResizerState(align_corners_, false);
656       st.ValidateAndCalculateOutputSize(context, input);
657       if (!context->status().ok()) return;
658     } else {
659       // Set up the resize parameters to do no scaling at all.
660       st.batch_size = input.dim_size(0);
661       st.out_height = input.dim_size(1);
662       st.out_width = input.dim_size(2);
663       st.in_height = input.dim_size(1);
664       st.in_width = input.dim_size(2);
665       st.channels = input.dim_size(3);
666       st.height_scale = 1.0f;
667       st.width_scale = 1.0f;
668     }
669     TensorShape resized_shape(
670         {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
671     int paddings_index;
672     int filter_index;
673     if (DoResize) {
674       paddings_index = 2;
675       filter_index = 3;
676     } else {
677       paddings_index = 1;
678       filter_index = 2;
679     }
680     const Tensor& paddings = context->input(paddings_index);
681 
682     const int dims = resized_shape.dims();
683     OP_REQUIRES(
684         context,
685         TensorShapeUtils::IsMatrix(paddings.shape()) &&
686             paddings.dim_size(1) == 2,
687         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
688                                 paddings.shape().DebugString()));
689     OP_REQUIRES(
690         context, dims == paddings.dim_size(0),
691         errors::InvalidArgument(
692             "The first dimension of paddings must be the rank of inputs: ",
693             dims, " ", paddings.shape().DebugString(), " ",
694             resized_shape.DebugString()));
695     OP_REQUIRES(
696         context, dims == paddings.dim_size(0),
697         errors::InvalidArgument(
698             "The first dimension of paddings must be the rank of inputs: ",
699             dims, " ", paddings.shape().DebugString(), " ",
700             resized_shape.DebugString()));
701 
702     OP_REQUIRES(
703         context, dims == 4,
704         errors::InvalidArgument(
705             "Fused mirror padding only supports four-dimensional inputs, but ",
706             dims, " requested"));
707 
708     // Compute the shape of the output tensor, and allocate it.
709     TensorShape padded_shape;
710     TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
711     for (int d = 0; d < dims; ++d) {
712       const int32 before =
713           paddings_matrix(d, 0);  // Pad before existing elements.
714       const int32 after =
715           paddings_matrix(d, 1);  // Pad after existing elements.
716       OP_REQUIRES(context, before >= 0 && after >= 0,
717                   errors::InvalidArgument(
718                       "paddings must be non-negative: ", before, " ", after));
719       if (offset_ == 0) {  // SYMMETRIC mode.
720         OP_REQUIRES(
721             context,
722             before <= resized_shape.dim_size(d) &&
723                 after <= resized_shape.dim_size(d),
724             errors::InvalidArgument("paddings must be no greater "
725                                     "than the dimension size: ",
726                                     before, ", ", after, " greater than ",
727                                     resized_shape.dim_size(d)));
728       } else if (offset_ == 1) {  // REFLECT mode.
729         OP_REQUIRES(
730             context,
731             before < resized_shape.dim_size(d) &&
732                 after < resized_shape.dim_size(d),
733             errors::InvalidArgument("paddings must be less than"
734                                     " the dimension size: ",
735                                     before, ", ", after, " not less than ",
736                                     resized_shape.dim_size(d)));
737       }
738       padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
739     }
740 
741     OP_REQUIRES(
742         context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
743         errors::InvalidArgument(
744             "Fused mirror padding only support spatial padding, not batches: ",
745             paddings.DebugString()));
746     OP_REQUIRES(
747         context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
748         errors::InvalidArgument(
749             "Fused mirror padding only support spatial padding, not channels: ",
750             paddings.DebugString()));
751     const int32 top_padding = paddings_matrix(1, 0);
752     const int32 bottom_padding = paddings_matrix(1, 1);
753     const int32 left_padding = paddings_matrix(2, 0);
754     const int32 right_padding = paddings_matrix(2, 1);
755 
756     // Input filter is of the following dimensions:
757     // [ filter_rows, filter_cols, in_depth, out_depth]
758     const Tensor& filter = context->input(filter_index);
759 
760     // For 2D convolution, there should be 4 dimensions.
761     OP_REQUIRES(context, padded_shape.dims() == 4,
762                 errors::InvalidArgument("input must be 4-dimensional",
763                                         padded_shape.DebugString()));
764     OP_REQUIRES(context, filter.dims() == 4,
765                 errors::InvalidArgument("filter must be 4-dimensional: ",
766                                         filter.shape().DebugString()));
767 
768     // We only check the first three dims, since the depth is accessed as an
769     // int64 below.
770     for (int i = 0; i < 3; i++) {
771       OP_REQUIRES(
772           context,
773           FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
774           errors::InvalidArgument("filter too large"));
775     }
776 
777     // The last dimension for input is in_depth. It must be the same as the
778     // filter's in_depth.
779     const int64 in_depth = padded_shape.dim_size(3);
780     OP_REQUIRES(context, in_depth == filter.dim_size(2),
781                 errors::InvalidArgument(
782                     "input and filter must have the same depth: ", in_depth,
783                     " vs ", filter.dim_size(2)));
784 
785     // The last dimension for filter is out_depth.
786     const int out_depth = static_cast<int>(filter.dim_size(3));
787 
788     // The second dimension for input is rows/height.
789     // The first dimension for filter is rows/height.
790     const int64 padded_rows_raw = padded_shape.dim_size(1);
791     OP_REQUIRES(
792         context,
793         FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
794         errors::InvalidArgument("Input rows too large"));
795     const int padded_rows = static_cast<int>(padded_rows_raw);
796     const int filter_rows = static_cast<int>(filter.dim_size(0));
797     const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
798 
799     // The third dimension for input is columns/width.
800     // The second dimension for filter is columns/width.
801     const int64 padded_cols_raw = padded_shape.dim_size(2);
802     OP_REQUIRES(
803         context,
804         FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
805         errors::InvalidArgument("Input cols too large"));
806     const int padded_cols = static_cast<int>(padded_cols_raw);
807     const int filter_cols = static_cast<int>(filter.dim_size(1));
808     const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
809 
810     // The first dimension for input is batch.
811     const int64 batch_raw = padded_shape.dim_size(0);
812     OP_REQUIRES(context,
813                 FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
814                 errors::InvalidArgument("batch is too large"));
815     const int batch = static_cast<int>(batch_raw);
816 
817     // For now we take the stride from the second and third dimensions only (we
818     // do not support striding on the batch or depth dimension).
819     const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
820     const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');
821 
822     int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
823     OP_REQUIRES_OK(context,
824                    GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
825                                          padding_, &out_rows, &pad_rows));
826     OP_REQUIRES_OK(context,
827                    GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
828                                          padding_, &out_cols, &pad_cols));
829     TensorShape out_shape =
830         ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
831     OP_REQUIRES(context, (out_shape.num_elements() > 0),
832                 errors::InvalidArgument("Output tensor can't be empty"));
833 
834     // Output tensor is of the following dimensions:
835     // [ in_batch, out_rows, out_cols, out_depth ]
836     Tensor* output = nullptr;
837     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
838 
839     VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
840             << ", padded_cols = " << padded_cols
841             << ", resized_cols = " << resized_cols
842             << ", filter_cols = " << filter_cols
843             << ", padded_rows = " << padded_rows
844             << ", resized_rows = " << resized_rows
845             << ", filter_rows = " << filter_rows
846             << ", stride_rows = " << stride_rows
847             << ", stride_cols = " << stride_cols
848             << ", out_depth = " << out_depth << ", DoResize=" << DoResize;
849 
850     // If there is nothing to compute, return.
851     if (out_shape.num_elements() == 0) {
852       return;
853     }
854     TConvFunctor conv_functor;
855     conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
856                  padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
857                  filter_cols, out_depth, stride_rows, stride_cols, padding_,
858                  output->flat<T>().data(), out_rows, out_cols, st, top_padding,
859                  bottom_padding, left_padding, right_padding, offset_);
860   }
861 
862  private:
863   std::vector<int32> strides_;
864   Padding padding_;
865   bool align_corners_;
866   int offset_;
867 
868   TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
869 };
870 
871 #define REGISTER_FUSED(T)                                                 \
872   REGISTER_KERNEL_BUILDER(                                                \
873       Name("FusedResizeAndPadConv2D")                                     \
874           .Device(DEVICE_CPU)                                             \
875           .TypeConstraint<T>("T"),                                        \
876       FusedResizeConv2DUsingGemmOp<                                       \
877           T,                                                              \
878           FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
879                                        BILINEAR>,                         \
880           true>);
881 
882 TF_CALL_half(REGISTER_FUSED);
883 TF_CALL_float(REGISTER_FUSED);
884 TF_CALL_double(REGISTER_FUSED);
885 
886 #define REGISTER_PAD_ONLY_FUSED(T)                                        \
887   REGISTER_KERNEL_BUILDER(                                                \
888       Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
889       FusedResizeConv2DUsingGemmOp<                                       \
890           T,                                                              \
891           FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
892                                        NEAREST>,                          \
893           false>);
894 
895 TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
896 TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
897 TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
898 
899 }  // namespace tensorflow
900