/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>

#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/context.h"
#include "tensorflow/contrib/lite/kernels/gemm_support.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include "tensorflow/contrib/lite/kernels/op_macros.h"
#include "tensorflow/contrib/lite/kernels/padding.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace conv {

// This file has 4 implementations of Conv.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kMultithreadOptimized,
  // The kernel uses the CBLAS interface for matrix multiplication.
  // It's fast when an optimized CBLAS implementation is available (e.g. Apple
  // Accelerate Framework), and it's slow when falling back to a naive
  // implementation.
  kCblasOptimized,
};

struct OpData {
  // IDs are the arbitrary identifiers used by TF Lite to identify and access
  // memory buffers.
  int im2col_id;
  int hwcn_weights_id;

  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
  int32_t output_multiplier;
  int output_shift;
  // The range of the fused activation layer. For example for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
  // Indexes are offsets into the array used to keep track of the allocated
  // temporaries.
  int32_t im2col_index;
  int32_t hwcn_weights_index;
  bool need_hwcn_weights;
  bool have_weights_been_transposed;
  bool need_im2col;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // This is a builtin op, so we don't use the contents in 'buffer', if any.
  // Instead, we allocate a new object to use as scratch space for im2col, and
  // to carry information from Prepare() to Eval().
  auto* data = new OpData;
  context->AddTensors(context, 1, &data->im2col_id);
  context->AddTensors(context, 1, &data->hwcn_weights_id);
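  // The quantized path shares a gemmlowp context per TfLiteContext; count this
  // node as a user so the context stays alive until Free() is called.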
  gemm_support::IncrementUsageCounter(context);
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  gemm_support::DecrementUsageCounter(context);
  delete reinterpret_cast<OpData*>(buffer);
}

// Naive implementation of transpose for floats. Could be optimized to be more
// cache friendly, but for now it's a one-time cost on first run, and we would
// prefer to remove the need to do this at all eventually.
void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
  const int rows = output->dims->data[1];
  const int cols = output->dims->data[0];
  const float* input_data = GetTensorData<float>(input);
  float* output_data = GetTensorData<float>(output);
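  // 'input' is read as a row-major [rows, cols] matrix and written to 'output'
  // as its row-major [cols, rows] transpose.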
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const float in_value = input_data[i * cols + j];
      output_data[j * rows + i] = in_value;
    }
  }
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  bool hasBias = node->inputs->size == 3;
  // Check number of inputs/outputs
  TF_LITE_ENSURE(context, hasBias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
  // Check dimensionality of input, filter
  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
  TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
  // Check input channels matching filter
  TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);

  // Check types. (We assume that UINT8 refers to quantized tensors)
  TfLiteType data_type = input->type;
  TF_LITE_ENSURE(context,
                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
  TF_LITE_ENSURE_EQ(context, output->type, data_type);
  TF_LITE_ENSURE_EQ(context, filter->type, data_type);

  TfLiteTensor* bias = nullptr;

  // TODO(ahentz): At this point the optimized versions require 'bias'. We can
  // either change that or document that convolution requires it.
  TF_LITE_ENSURE(context, hasBias);

  if (hasBias) {
    bias = &context->tensors[node->inputs->data[2]];
    if (data_type == kTfLiteUInt8) {
      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
    } else {
      TF_LITE_ENSURE_EQ(context, bias->type, data_type);
    }
    TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], filter->dims->data[0]);
  }

  int channels_out = filter->dims->data[0];
  int width = input->dims->data[2];
  int height = input->dims->data[1];
  int filter_width = filter->dims->data[2];
  int filter_height = filter->dims->data[1];
  int batches = input->dims->data[0];

  // Matching GetWindowedOutputSize in TensorFlow.
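  // For kTfLitePaddingSame the output size is ceil(image / stride); for
  // kTfLitePaddingValid it is ceil((image - filter + 1) / stride).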
  auto padding = params->padding;
  auto computeOutSize = [padding](int imageSize, int filterSize,
                                  int stride) -> int {
    return padding == kTfLitePaddingSame
               ? (imageSize + stride - 1) / stride
               : padding == kTfLitePaddingValid
                     ? (imageSize - filterSize + stride) / stride
                     : 0;
  };

  int outWidth = computeOutSize(width, filter_width, params->stride_width);
  int outHeight = computeOutSize(height, filter_height, params->stride_height);

  data->padding.height =
      ComputePadding(params->stride_height, height, filter_height, outHeight);
  data->padding.width =
      ComputePadding(params->stride_width, width, filter_width, outWidth);

  TF_LITE_ENSURE(context, hasBias);

  // Note that quantized inference requires that all tensors have their
  // parameters set. This is usually done during quantized training.
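  // The 'real multiplier' is the combined rescaling factor
  // input_scale * filter_scale / output_scale; it is decomposed below into a
  // fixed point multiplier and shift so Eval can use integer-only arithmetic.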
  if (data_type != kTfLiteFloat32) {
    double real_multiplier = 0.0;
    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
        context, input, filter, bias, output, &real_multiplier));
    QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
                                     &data->output_shift);
    CalculateActivationRangeUint8(params->activation, output,
                                  &data->output_activation_min,
                                  &data->output_activation_max);
  }

  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
  output_size->data[0] = batches;
  output_size->data[1] = outHeight;
  output_size->data[2] = outWidth;
  output_size->data[3] = channels_out;
  auto output_status = context->ResizeTensor(context, output, output_size);

  if (output_status != kTfLiteOk) return output_status;

  // We don't always need to allocate im2col. It is only used in some versions
  // of the optimized Conv. This test just mimics something that happens inside
  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
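  // im2col unrolls each convolution window into a column so the convolution
  // can be computed as a single matrix multiplication; with 1x1 filters and
  // unit strides the input already has that layout and no scratch buffer is
  // needed.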
  data->need_im2col =
      (params->stride_width != 1 || params->stride_height != 1 ||
       filter_width != 1 || filter_height != 1);
  // If we're using the optimized multithreaded EigenTensor implementation of
  // convolution, it expects the filter weights to be transposed compared to
  // the normal TF Lite buffer format. Typical TF Lite weights are
  // [filter_count, filter_height, filter_width, input_depth], but for the float
  // implementation we need them as [filter_height, filter_width, input_depth,
  // filter_count]. We get to that format by transposing, and create a temporary
  // buffer to store the results.
  // This path is only used for float processing, so only create the buffer if
  // we're running with that data type.
  data->need_hwcn_weights = (data_type == kTfLiteFloat32);

  int temporaries_count = 0;
  if (data->need_im2col) {
    data->im2col_index = temporaries_count;
    ++temporaries_count;
  }
  if (data->need_hwcn_weights) {
    data->hwcn_weights_index = temporaries_count;
    ++temporaries_count;
  }

  TfLiteIntArrayFree(node->temporaries);
  node->temporaries = TfLiteIntArrayCreate(temporaries_count);

  if (data->need_im2col) {
    node->temporaries->data[data->im2col_index] = data->im2col_id;

    TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);

    int input_depth = input->dims->data[3];
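    // The im2col buffer holds one entry per output position, each containing
    // the flattened receptive field of filter_height * filter_width *
    // input_depth input values.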
    im2col_size->data[0] = output_size->data[0];
    im2col_size->data[1] = output_size->data[1];
    im2col_size->data[2] = output_size->data[2];
    im2col_size->data[3] = input_depth * filter_height * filter_width;

    TfLiteTensor* im2col =
        &context->tensors[node->temporaries->data[data->im2col_index]];
    im2col->type = data_type;
    im2col->allocation_type = kTfLiteArenaRw;
    auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
    if (im2col_status != kTfLiteOk) return im2col_status;
  }

  if (data->need_hwcn_weights) {
    node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
    TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);

    // Because we're treating the filter weights as a matrix when we do the
    // transpose, we allocate the buffer with a two-dimensional shape, where one
    // dimension is the number of elements in each filter, and the second is the
    // total number of filters.
    int input_depth = input->dims->data[3];
    hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
    hwcn_weights_size->data[1] = channels_out;

    TfLiteTensor* hwcn_weights =
        &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
    hwcn_weights->type = data_type;
    hwcn_weights->allocation_type = kTfLiteDynamic;
    // Make sure we release any previous allocations before we reallocate.
    // TODO(petewarden): Persistent arenas would be a better fit for this, but
    // they aren't fully implemented yet.
    if (hwcn_weights->data.raw) {
      free(hwcn_weights->data.raw);
      hwcn_weights->data.raw = nullptr;
    }

    // Note that hwcn_weights is a kTfLiteDynamic tensor, and ResizeTensor will
    // actually allocate space for it. It would be more efficient if we placed
    // hwcn_weights in the persistent arena.
    auto hwcn_weights_status =
        context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
    if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;

    // TODO(petewarden): If Resize() is called when the size hasn't actually
    // changed, this will do extra redundant work.
    data->have_weights_been_transposed = false;
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
                   TfLiteTensor* filter, TfLiteTensor* bias,
                   TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
                   TfLiteTensor* output) {
  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);

  auto input_offset = -input->params.zero_point;
  auto filter_offset = -filter->params.zero_point;
  auto output_offset = output->params.zero_point;
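  // The kernels take negated input/filter zero points so that adding the
  // offset to a quantized value re-centers it on zero before the
  // multiply-accumulate; the output zero point is added back at the end.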

  switch (kernel_type) {
    case kReference:
      reference_ops::Conv(
          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
          GetTensorData<int32_t>(bias), GetTensorDims(bias),
          params->stride_width, params->stride_height, data->padding.width,
          data->padding.height, output_offset, data->output_multiplier,
          data->output_shift, data->output_activation_min,
          data->output_activation_max, GetTensorData<uint8_t>(output),
          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
          GetTensorDims(im2col), gemm_context);
      break;
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized:
      // There is only one optimized implementation for Quantized Conv.
      optimized_ops::Conv(
          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
          GetTensorData<int32_t>(bias), GetTensorDims(bias),
          params->stride_width, params->stride_height, data->padding.width,
          data->padding.height, output_offset, data->output_multiplier,
          data->output_shift, data->output_activation_min,
          data->output_activation_max, GetTensorData<uint8_t>(output),
          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
          GetTensorDims(im2col), gemm_context);
      break;
  }
}

template <KernelType kernel_type>
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
               TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
               TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                &output_activation_max);
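  // The fused activation (e.g. ReLU or ReLU6) is applied by clamping the
  // output to [output_activation_min, output_activation_max].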

  switch (kernel_type) {
    case kReference: {
      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
                          GetTensorData<float>(filter), GetTensorDims(filter),
                          GetTensorData<float>(bias), GetTensorDims(bias),
                          params->stride_width, params->stride_height,
                          data->padding.width, data->padding.height,
                          output_activation_min, output_activation_max,
                          GetTensorData<float>(output), GetTensorDims(output),
                          GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
    case kGenericOptimized: {
      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
                          GetTensorData<float>(filter), GetTensorDims(filter),
                          GetTensorData<float>(bias), GetTensorDims(bias),
                          params->stride_width, params->stride_height,
                          data->padding.width, data->padding.height,
                          output_activation_min, output_activation_max,
                          GetTensorData<float>(output), GetTensorDims(output),
                          GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
    case kMultithreadOptimized: {
      const float* filter_data;
      if (data->need_hwcn_weights) {
        filter_data = GetTensorData<float>(hwcn_weights);
      } else {
        filter_data = GetTensorData<float>(filter);
      }
      multithreaded_ops::Conv(
          GetTensorData<float>(input), GetTensorDims(input), filter_data,
          GetTensorDims(filter), GetTensorData<float>(bias),
          GetTensorDims(bias), params->stride_width, params->stride_height,
          data->padding.width, data->padding.height, params->padding,
          output_activation_min, output_activation_max,
          GetTensorData<float>(output), GetTensorDims(output),
          GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
    case kCblasOptimized: {
      cblas_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
                      GetTensorData<float>(filter), GetTensorDims(filter),
                      GetTensorData<float>(bias), GetTensorDims(bias),
                      params->stride_width, params->stride_height,
                      data->padding.width, data->padding.height,
                      output_activation_min, output_activation_max,
                      GetTensorData<float>(output), GetTensorDims(output),
                      GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
  }
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
  bool hasBias = node->inputs->size == 3;
  TfLiteTensor* bias =
      hasBias ? &context->tensors[node->inputs->data[2]] : nullptr;
  TfLiteTensor* im2col =
      data->need_im2col
          ? &context->tensors[node->temporaries->data[data->im2col_index]]
          : nullptr;
  TfLiteTensor* hwcn_weights =
      data->need_hwcn_weights
          ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
          : nullptr;
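
  // The multithreaded float kernel expects HWCN-ordered weights, so transpose
  // them once on the first invocation and reuse the result, assuming the
  // filter tensor stays constant across invocations.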
  if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
    TransposeFloatTensor(filter, hwcn_weights);
    data->have_weights_been_transposed = true;
  }

  // TODO(aselle): Consider whether float conv and quantized conv should be
  // separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are the same.
    case kTfLiteFloat32:
      EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
                             im2col, hwcn_weights, output);
      break;
    case kTfLiteUInt8:
      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                 bias, im2col, hwcn_weights, output);
      break;
    default:
      context->ReportError(context, "Type not currently supported.");
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace conv

TfLiteRegistration* Register_CONVOLUTION_REF() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kReference>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kMultithreadOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kCblasOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONV_2D() {
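  // By default Conv dispatches to the multithreaded EigenTensor kernel; builds
  // that define TFLITE_USE_APPLE_ACCELERATE_FOR_CONV use the CBLAS kernel
  // (e.g. backed by Apple's Accelerate framework) instead.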
#ifdef TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
  return Register_CONVOLUTION_CBLAS_OPT();
#else
  return Register_CONVOLUTION_MULTITHREADED_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite