/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/eigen_support.h"
#include "tensorflow/lite/kernels/gemm_support.h"
#include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/kernels/padding.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace conv {

// This file has 4 implementations of Conv.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads
  // are available and kGenericOptimized when we must use only one thread.
  kMultithreadOptimized,
  // The kernel uses the CBLAS interface for matrix multiplication.
  // It's fast when an optimized CBLAS implementation is available (e.g. the
  // Apple Accelerate Framework), and slow when falling back to the naive
  // implementation.
  kCblasOptimized,
};

const int kTensorNotAllocated = -1;

struct OpData {
  // IDs are the arbitrary identifiers used by TF Lite to identify and access
  // memory buffers.
  int im2col_id = kTensorNotAllocated;
  int hwcn_weights_id = kTensorNotAllocated;
  int input_quantized_id = kTensorNotAllocated;
  int scaling_factors_id = kTensorNotAllocated;

  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
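  // For illustration (not tied to a particular sign convention): a real
  // multiplier of 0.125 can be stored as the fixed-point value 0.5
  // (0x40000000 in Q0.31) together with a shift of 2 to the right, since
  // 0.5 * 2^-2 == 0.125.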
  int32_t output_multiplier;
  int output_shift;

  // Per channel output multiplier and shift.
  std::vector<int32_t> per_channel_output_multiplier;
  std::vector<int> per_channel_output_shift;

  // The range of the fused activation layer. For example for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
  // Indexes are the offsets into the 'temporaries' array used to keep track
  // of the allocated temporary tensors.
  int32_t im2col_index;
  int32_t hwcn_weights_index;
  int32_t input_quantized_index;
  int32_t scaling_factors_index;
  bool need_hwcn_weights;
  bool have_weights_been_transposed;
  bool need_im2col;

  bool run_multithreaded_kernel;
};

inline PaddingType RuntimePaddingType(TfLitePadding padding) {
  switch (padding) {
    case TfLitePadding::kTfLitePaddingSame:
      return PaddingType::kSame;
    case TfLitePadding::kTfLitePaddingValid:
      return PaddingType::kValid;
    case TfLitePadding::kTfLitePaddingUnknown:
    default:
      return PaddingType::kNone;
  }
}

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // This is a builtin op, so we don't use the contents in 'buffer', if any.
  // Instead, we allocate a new object to use as scratch space for im2col, and
  // to carry information from Prepare() to Eval().
  auto* data = new OpData;
  gemm_support::IncrementUsageCounter(context);
  eigen_support::IncrementUsageCounter(context);
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  eigen_support::DecrementUsageCounter(context);
  gemm_support::DecrementUsageCounter(context);
  delete reinterpret_cast<OpData*>(buffer);
}

// Naive implementation of transpose for floats. Could be optimized to be more
// cache friendly, but for now it's a one-time cost on first run, and we would
// prefer to remove the need to do this at all eventually.
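// For example, treating the [2, 3] row-major input {1, 2, 3, 4, 5, 6} as the
// matrix {{1, 2, 3}, {4, 5, 6}}, the output is the [3, 2] transpose
// {{1, 4}, {2, 5}, {3, 6}}, i.e. {1, 4, 2, 5, 3, 6} in memory.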
void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
  const int rows = output->dims->data[1];
  const int cols = output->dims->data[0];
  const float* input_data = GetTensorData<float>(input);
  float* output_data = GetTensorData<float>(output);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const float in_value = input_data[i * cols + j];
      output_data[j * rows + i] = in_value;
    }
  }
}

// Allocate temporary tensors (`im2col`, `hwcn_weights` if necessary).
// Note: `context->AddTensors` might invalidate pointers to existing tensors.
// Therefore the logic to add tensors is isolated into this function.
static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context,
                                                       TfLiteNode* node,
                                                       bool is_hybrid) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE(context, node->inputs->size >= 2);
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];

  int filter_width = filter->dims->data[2];
  int filter_height = filter->dims->data[1];

  // We don't always need to allocate im2col. It is only used in some versions
  // of the optimized Conv. This test just mimics something that happens inside
  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
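  // Conceptually, im2col unrolls every filter window of the input into one row
  // of a matrix with filter_height * filter_width * input_depth columns, so
  // that the convolution can be computed as a single GEMM against the
  // flattened filter; the buffer is only worth allocating when the window is
  // larger than 1x1 or when strides/dilations make the unrolling non-trivial.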
  data->need_im2col =
      (params->stride_width != 1 || params->stride_height != 1 ||
       params->dilation_width_factor != 1 ||
       params->dilation_height_factor != 1 || filter_width != 1 ||
       filter_height != 1);
  // If we're using the optimized multithreaded EigenTensor implementation of
  // convolution, it expects the filter weights to be transposed compared to
  // the normal TF Lite buffer format. Typical TF Lite weights are
  // [filter_count, filter_height, filter_width, input_depth], but for the float
  // implementation we need them as [filter_height, filter_width, input_depth,
  // filter_count]. We get to that format by transposing, and create a temporary
  // buffer to store the results.
  // This path is only used for float processing, so only create the buffer if
  // we're running with that data type.
  data->need_hwcn_weights = (input->type == kTfLiteFloat32 &&
                             data->run_multithreaded_kernel && !is_hybrid);

  int temporaries_count = 0;
  if (data->need_im2col) {
    data->im2col_index = temporaries_count;
    if (data->im2col_id == kTensorNotAllocated) {
      context->AddTensors(context, 1, &data->im2col_id);
    }
    ++temporaries_count;
  }
  if (data->need_hwcn_weights) {
    data->hwcn_weights_index = temporaries_count;
    if (data->hwcn_weights_id == kTensorNotAllocated) {
      context->AddTensors(context, 1, &data->hwcn_weights_id);
    }
    ++temporaries_count;
  }

  if (is_hybrid) {
    // Allocate tensor to store the on-the-fly quantized inputs.
    data->input_quantized_index = temporaries_count;
    if (data->input_quantized_id == kTensorNotAllocated) {
      TF_LITE_ENSURE_OK(
          context, context->AddTensors(context, 1, &data->input_quantized_id));
    }
    ++temporaries_count;

    // Allocate tensor to store the quantization params computed during
    // on-the-fly input quantization.
    data->scaling_factors_index = temporaries_count;
    if (data->scaling_factors_id == kTensorNotAllocated) {
      TF_LITE_ENSURE_OK(
          context, context->AddTensors(context, 1, &data->scaling_factors_id));
    }
    ++temporaries_count;
  }

  TfLiteIntArrayFree(node->temporaries);
  node->temporaries = TfLiteIntArrayCreate(temporaries_count);

  return kTfLiteOk;
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  bool has_bias = node->inputs->size == 3;
  // Check number of inputs/outputs
  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];

  // Check dimensionality of input, filter
  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
  TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
  // Check input channels matching filter
  TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);

  // Check types. (We assume that UINT8 refers to quantized tensors)
  TfLiteType input_type = input->type;
  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
                              input_type == kTfLiteUInt8 ||
                              input_type == kTfLiteInt8);
  TF_LITE_ENSURE_EQ(context, output->type, input_type);

  TfLiteTensor* bias = nullptr;

  // TODO(ahentz): At this point the optimized versions require 'bias'. We can
  // either change that or document that convolution requires it.
  TF_LITE_ENSURE(context, has_bias);

  if (has_bias) {
    bias = &context->tensors[node->inputs->data[2]];
    if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
    } else {
      TF_LITE_ENSURE_EQ(context, bias->type, input_type);
    }
    TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
  }

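  // A 'hybrid' convolution has float inputs and outputs but a quantized
  // (uint8/int8) filter; EvalHybrid() quantizes the input on the fly and
  // scales the integer accumulators back to float.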
  const bool is_hybrid =
      (input->type == kTfLiteFloat32 &&
       (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8));

  data->run_multithreaded_kernel = context->recommended_num_threads != 1;
  // Hybrid kernels don't support multithreading yet.
  if (is_hybrid) {
    data->run_multithreaded_kernel = false;
  }

  TF_LITE_ENSURE_STATUS(
      AllocateTemporaryTensorsIfRequired(context, node, is_hybrid));

  int channels_in = filter->dims->data[3];
  int channels_out = filter->dims->data[0];
  int width = input->dims->data[2];
  int height = input->dims->data[1];
  int filter_width = filter->dims->data[2];
  int filter_height = filter->dims->data[1];
  int batches = input->dims->data[0];

  // Matching GetWindowedOutputSize in TensorFlow.
  auto padding = params->padding;
  auto compute_out_size = [padding](int image_size, int filter_size, int stride,
                                    int dilation_rate) -> int {
    int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
    return padding == kTfLitePaddingSame
               ? (image_size + stride - 1) / stride
               : padding == kTfLitePaddingValid
                     ? (image_size - effective_filter_size + stride) / stride
                     : 0;
  };
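  // Worked example: image_size = 224, filter_size = 3, stride = 2,
  // dilation_rate = 1 gives effective_filter_size = 3, so SAME padding yields
  // (224 + 2 - 1) / 2 = 112 and VALID padding yields (224 - 3 + 2) / 2 = 111.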

  int out_width = compute_out_size(width, filter_width, params->stride_width,
                                   params->dilation_width_factor);
  int out_height =
      compute_out_size(height, filter_height, params->stride_height,
                       params->dilation_height_factor);

  data->padding.height =
      ComputePadding(params->stride_height, params->dilation_height_factor,
                     height, filter_height, out_height);
  data->padding.width =
      ComputePadding(params->stride_width, params->dilation_width_factor, width,
                     filter_width, out_width);

  TF_LITE_ENSURE(context, has_bias);

  // Note that full fixed-point inference requires that all tensors have their
  // parameters set. This is usually done during quantized training or
  // calibration.
  if (input_type != kTfLiteFloat32) {
    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
                      kTfLiteAffineQuantization);
    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    TF_LITE_ENSURE(context, affine_quantization);
    TF_LITE_ENSURE(context, affine_quantization->scale);
    const int number_channel = affine_quantization->scale->size;
    data->per_channel_output_multiplier.resize(number_channel);
    data->per_channel_output_shift.resize(number_channel);
    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
        context, input, filter, bias, output, params->activation,
        &data->output_multiplier, &data->output_shift,
        &data->output_activation_min, &data->output_activation_max,
        data->per_channel_output_multiplier.data(),
        data->per_channel_output_shift.data()));
  }

  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
  output_size->data[0] = batches;
  output_size->data[1] = out_height;
  output_size->data[2] = out_width;
  output_size->data[3] = channels_out;
  auto output_status = context->ResizeTensor(context, output, output_size);

  if (output_status != kTfLiteOk) return output_status;

  if (data->need_im2col) {
    node->temporaries->data[data->im2col_index] = data->im2col_id;

    TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);

    int input_depth = input->dims->data[3];
    im2col_size->data[0] = output_size->data[0];
    im2col_size->data[1] = output_size->data[1];
    im2col_size->data[2] = output_size->data[2];
    im2col_size->data[3] = input_depth * filter_height * filter_width;

    TfLiteTensor* im2col =
        &context->tensors[node->temporaries->data[data->im2col_index]];
    im2col->type = input->type;
    if (is_hybrid) {
      im2col->type = filter->type;
    }
    im2col->allocation_type = kTfLiteArenaRw;
    auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
    if (im2col_status != kTfLiteOk) return im2col_status;
  }

  if (data->need_hwcn_weights) {
    node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
    TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);

    // Because we're treating the filter weights as a matrix when we do the
    // transpose, we allocate the buffer with a two-dimensional shape, where one
    // dimension is the number of elements in each filter, and the second is the
    // total number of filters.
    int input_depth = input->dims->data[3];
    hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
    hwcn_weights_size->data[1] = channels_out;

    TfLiteTensor* hwcn_weights =
        &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
    hwcn_weights->type = input_type;
    hwcn_weights->allocation_type = kTfLiteArenaRwPersistent;

    auto hwcn_weights_status =
        context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
    if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;

    // TODO(petewarden): If Resize() is called when the size hasn't actually
    // changed, this will do extra redundant work.
    data->have_weights_been_transposed = false;
  }

  if (is_hybrid) {
    node->temporaries->data[data->input_quantized_index] =
        data->input_quantized_id;
    TfLiteTensor* input_quantized =
        GetTemporary(context, node, data->input_quantized_index);
    input_quantized->type = kTfLiteInt8;
    input_quantized->allocation_type = kTfLiteArenaRw;
    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
                                                       input_quantized_size));
    }

    node->temporaries->data[data->scaling_factors_index] =
        data->scaling_factors_id;
    TfLiteTensor* scaling_factors =
        GetTemporary(context, node, data->scaling_factors_index);
    scaling_factors->type = kTfLiteFloat32;
    scaling_factors->allocation_type = kTfLiteArenaRw;
    // Only one scale factor per batch is typically necessary. See optimized
    // implementation for why we need to allocate for the height of the inputs
    // flattened to 2D.
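    // For a [batch, height, width, channels] input this is
    // batch * height * width, which always covers the per-batch factors.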
    const int height = NumElements(input) / channels_in;
    int scaling_dims[1] = {height};
    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
      scaling_factors_size->data[0] = height;
      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                       scaling_factors_size));
    }
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
                   TfLiteTensor* filter, TfLiteTensor* bias,
                   TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
                   TfLiteTensor* output) {
  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);

  auto input_offset = -input->params.zero_point;
  auto filter_offset = -filter->params.zero_point;
  auto output_offset = output->params.zero_point;
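  // The input and filter offsets are the negated zero points because the
  // kernels add the offset, i.e. (quantized_value + offset) recovers
  // (quantized_value - zero_point); the output offset is the zero point
  // itself since it is added back after requantization.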

  KernelType effective_kernel_type;
  if ((kernel_type == kMultithreadOptimized ||
       kernel_type == kCblasOptimized) &&
      (params->dilation_width_factor != 1 ||
       params->dilation_height_factor != 1)) {
    // kMultithreadOptimized and kCblasOptimized do not support dilation.
    // Therefore, fall back to kGenericOptimized.
    effective_kernel_type = kGenericOptimized;
  } else {
    effective_kernel_type = kernel_type;
  }

  switch (effective_kernel_type) {
    case kReference: {
      ConvParams op_params;
      op_params.padding_type = PaddingType::kSame;
      op_params.padding_values.width = data->padding.width;
      op_params.padding_values.height = data->padding.height;
      op_params.stride_width = params->stride_width;
      op_params.stride_height = params->stride_height;
      op_params.dilation_width_factor = params->dilation_width_factor;
      op_params.dilation_height_factor = params->dilation_height_factor;
      op_params.input_offset = input_offset;
      op_params.weights_offset = filter_offset;
      op_params.output_offset = output_offset;
      op_params.output_multiplier = data->output_multiplier;
      op_params.output_shift = -data->output_shift;
      op_params.quantized_activation_min = data->output_activation_min;
      op_params.quantized_activation_max = data->output_activation_max;
      reference_ops::Conv(
          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
          GetTensorShape(bias), GetTensorData<int32_t>(bias),
          GetTensorShape(output), GetTensorData<uint8_t>(output),
          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), gemm_context);
      break;
    }
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      // There is only one optimized implementation for Quantized Conv.
      ConvParams op_params;
      op_params.padding_type = PaddingType::kSame;
      op_params.padding_values.width = data->padding.width;
      op_params.padding_values.height = data->padding.height;
      op_params.stride_width = params->stride_width;
      op_params.stride_height = params->stride_height;
      op_params.dilation_width_factor = params->dilation_width_factor;
      op_params.dilation_height_factor = params->dilation_height_factor;
      op_params.input_offset = input_offset;
      op_params.weights_offset = filter_offset;
      op_params.output_offset = output_offset;
      op_params.output_multiplier = data->output_multiplier;
      op_params.output_shift = -data->output_shift;
      op_params.quantized_activation_min = data->output_activation_min;
      op_params.quantized_activation_max = data->output_activation_max;
      optimized_ops::Conv(
          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
          GetTensorShape(bias), GetTensorData<int32_t>(bias),
          GetTensorShape(output), GetTensorData<uint8_t>(output),
          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), gemm_context);
      break;
    }
  }
}

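// Per-channel quantized (int8) convolution. Unlike the uint8 path above, each
// output channel carries its own requantization multiplier and shift, computed
// in Prepare() from the per-channel filter scales.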
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                             TfLiteConvParams* params, OpData* data,
                             TfLiteTensor* input, TfLiteTensor* filter,
                             TfLiteTensor* bias, TfLiteTensor* output) {
  ConvParams op_params;
  op_params.input_offset = input->params.zero_point;
  op_params.output_offset = output->params.zero_point;
  op_params.stride_height = params->stride_height;
  op_params.stride_width = params->stride_width;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.padding_values.height = data->padding.height;
  op_params.padding_values.width = data->padding.width;

  reference_integer_ops::ConvPerChannel(
      op_params, data->per_channel_output_multiplier.data(),
      data->per_channel_output_shift.data(), GetTensorShape(input),
      GetTensorData<int8>(input), GetTensorShape(filter),
      GetTensorData<int8>(filter), GetTensorShape(bias),
      GetTensorData<int32>(bias), GetTensorShape(output),
      GetTensorData<int8>(output));
}

template <KernelType kernel_type>
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
               TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
               TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);
  KernelType effective_kernel_type;
  if ((kernel_type == kMultithreadOptimized) &&
      (params->dilation_width_factor != 1 ||
       params->dilation_height_factor != 1)) {
    // kMultithreadOptimized does not support dilation.
    // Therefore, fall back to kGenericOptimized.
    effective_kernel_type = kGenericOptimized;
  } else {
    effective_kernel_type = kernel_type;
  }
  ConvParams op_params;
  op_params.padding_type = RuntimePaddingType(params->padding);
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;
  switch (effective_kernel_type) {
    case kReference: {
      reference_ops::Conv(op_params, GetTensorShape(input),
                          GetTensorData<float>(input), GetTensorShape(filter),
                          GetTensorData<float>(filter), GetTensorShape(bias),
                          GetTensorData<float>(bias), GetTensorShape(output),
                          GetTensorData<float>(output), GetTensorShape(im2col),
                          GetTensorData<float>(im2col));
      break;
    }
    case kCblasOptimized:
    case kGenericOptimized: {
      optimized_ops::Conv(op_params, GetTensorShape(input),
                          GetTensorData<float>(input), GetTensorShape(filter),
                          GetTensorData<float>(filter), GetTensorShape(bias),
                          GetTensorData<float>(bias), GetTensorShape(output),
                          GetTensorData<float>(output), GetTensorShape(im2col),
                          GetTensorData<float>(im2col));
      break;
    }
    case kMultithreadOptimized: {
      const float* filter_data;
      if (data->need_hwcn_weights) {
        filter_data = GetTensorData<float>(hwcn_weights);
      } else {
        filter_data = GetTensorData<float>(filter);
      }
      multithreaded_ops::Conv(
          *eigen_support::GetThreadPoolDevice(context), op_params,
          GetTensorShape(input), GetTensorData<float>(input),
          GetTensorShape(filter), filter_data, GetTensorShape(bias),
          GetTensorData<float>(bias), GetTensorShape(output),
          GetTensorData<float>(output), GetTensorShape(im2col),
          GetTensorData<float>(im2col));
      break;
    }
  }
}

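// Hybrid convolution: the float input is symmetrically quantized to int8 one
// batch at a time, multiplied against the int8/uint8 filter, and the integer
// accumulators are scaled back to float with the per-batch scaling factors.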
template <KernelType kernel_type>
void EvalHybrid(TfLiteContext* context, TfLiteNode* node,
                TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
                TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
                TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);

  const int input_size = NumElements(input) / SizeOfDimension(input, 0);
  const int batch_size = SizeOfDimension(input, 0);

  const TfLiteTensor* input_quantized =
      GetTemporary(context, node, data->input_quantized_index);
  int8_t* quantized_input_ptr_batch = input_quantized->data.int8;
  float* scaling_factors_ptr =
      GetTemporary(context, node, data->scaling_factors_index)->data.f;

  // Per-batch input quantization for higher accuracy.
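  // SymmetricQuantizeFloats maps each batch's values to int8 so that the
  // batch's max |value| lands near +/-127; the resulting per-batch scale,
  // multiplied by the filter scale below, is what converts the integer
  // accumulators back to floats inside HybridConv.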
  for (int b = 0; b < batch_size; ++b) {
    float unused_min, unused_max;
    const int offset = b * input_size;
    tensor_utils::SymmetricQuantizeFloats(
        input->data.f + offset, input_size, quantized_input_ptr_batch + offset,
        &unused_min, &unused_max, &scaling_factors_ptr[b]);
    scaling_factors_ptr[b] *= filter->params.scale;
  }

  int8_t* im2col_ptr = nullptr;
  int8_t* filter_ptr = nullptr;
  if (filter->type == kTfLiteUInt8) {
    // For backward compatibility, we need to support the case where filters
    // are quantized to int8 but stored as uint8.
    if (im2col != nullptr) {
      im2col_ptr = reinterpret_cast<int8_t*>(im2col->data.uint8);
    }
    filter_ptr = reinterpret_cast<int8_t*>(filter->data.uint8);
  } else {
    // Code at head uses the int8 type so we do not need to do the cast.
    if (im2col != nullptr) {
      im2col_ptr = im2col->data.int8;
    }
    filter_ptr = filter->data.int8;
  }

  switch (kernel_type) {
    case kReference:
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      // There is only one implementation for the hybrid kernel. Note that
      // it does not make use of gemmlowp nor support multithreading.
      ConvParams op_params;
      op_params.padding_type = PaddingType::kSame;
      op_params.padding_values.width = data->padding.width;
      op_params.padding_values.height = data->padding.height;
      op_params.stride_width = params->stride_width;
      op_params.stride_height = params->stride_height;
      op_params.dilation_width_factor = 1;
      op_params.dilation_height_factor = 1;
      op_params.float_activation_min = output_activation_min;
      op_params.float_activation_max = output_activation_max;
      optimized_ops::HybridConv(
          op_params, scaling_factors_ptr, GetTensorShape(input),
          quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
          GetTensorShape(bias), GetTensorData<float>(bias),
          GetTensorShape(output), GetTensorData<float>(output),
          GetTensorShape(im2col), im2col_ptr);
      break;
    }
  }
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
  bool has_bias = node->inputs->size == 3;
  TfLiteTensor* bias =
      has_bias ? &context->tensors[node->inputs->data[2]] : nullptr;
  TfLiteTensor* im2col =
      data->need_im2col
          ? &context->tensors[node->temporaries->data[data->im2col_index]]
          : nullptr;
  TfLiteTensor* hwcn_weights =
      data->need_hwcn_weights
          ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
          : nullptr;

  if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
    TransposeFloatTensor(filter, hwcn_weights);
    data->have_weights_been_transposed = true;
  }

  // TODO(aselle): Consider whether float conv and quantized conv should be
  // separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are the same.
    case kTfLiteFloat32:
      if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
        EvalHybrid<kernel_type>(context, node, params, data, input, filter,
                                bias, im2col, hwcn_weights, output);
      } else if (data->run_multithreaded_kernel) {
        EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
                               im2col, hwcn_weights, output);
      } else {
        EvalFloat<kGenericOptimized>(context, node, params, data, input, filter,
                                     bias, im2col, hwcn_weights, output);
      }
      break;
    case kTfLiteUInt8:
      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                 bias, im2col, hwcn_weights, output);
      break;
    case kTfLiteInt8:
      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
                              output);
      break;
    default:
      context->ReportError(context, "Type %d not currently supported.",
                           input->type);
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace conv

TfLiteRegistration* Register_CONVOLUTION_REF() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kReference>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kMultithreadOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kCblasOptimized>};
  return &r;
}

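// The default CONV_2D registration prefers the CBLAS kernel when built with
// TFLITE_USE_APPLE_ACCELERATE_FOR_CONV, and the multithreaded Eigen kernel
// otherwise.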
TfLiteRegistration* Register_CONV_2D() {
#ifdef TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
  return Register_CONVOLUTION_CBLAS_OPT();
#else
  return Register_CONVOLUTION_MULTITHREADED_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite