/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/eigen_support.h"
#include "tensorflow/lite/kernels/gemm_support.h"
#include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/kernels/padding.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace conv {

// This file has 4 implementations of Conv.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads
  // are available and kGenericOptimized when we must use only one thread.
  kMultithreadOptimized,
  // The kernel uses the CBLAS interface for matrix multiplication.
  // It's fast when an optimized CBLAS implementation is available (e.g. the
  // Apple Accelerate framework), and slow when falling back to the naive
  // implementation.
  kCblasOptimized,
};
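
// The kernel type is chosen at registration time: each Register_CONVOLUTION_*
// function below instantiates Eval<kernel_type> with one of these enumerators
// (with a runtime fallback to kGenericOptimized for dilated convolutions in
// some paths).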

const int kTensorNotAllocated = -1;

struct OpData {
  // IDs are the arbitrary identifiers used by TF Lite to identify and access
  // memory buffers.
  int im2col_id = kTensorNotAllocated;
  int hwcn_weights_id = kTensorNotAllocated;
  int input_quantized_id = kTensorNotAllocated;
  int scaling_factors_id = kTensorNotAllocated;

  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
  int32_t output_multiplier;
  int output_shift;
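  // For illustration (not tied to any particular tensor): a real multiplier of
  // 0.00390625 (= 0.5 * 2^-7) would be stored as a Q31 mantissa of 0.5, i.e.
  // output_multiplier = 1 << 30, combined with a 7-bit right shift. See
  // QuantizeMultiplier() in internal/quantization_util.h for the exact
  // decomposition and rounding.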

  // Per channel output multiplier and shift.
  std::vector<int32_t> per_channel_output_multiplier;
  std::vector<int> per_channel_output_shift;

  // The range of the fused activation layer. For example for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
  // Indexes are the offsets into the array used to keep track of the
  // allocated temporaries.
  int32_t im2col_index;
  int32_t hwcn_weights_index;
  int32_t input_quantized_index;
  int32_t scaling_factors_index;
  bool need_hwcn_weights;
  bool have_weights_been_transposed;
  bool need_im2col;

  bool run_multithreaded_kernel;
};

inline PaddingType RuntimePaddingType(TfLitePadding padding) {
  switch (padding) {
    case TfLitePadding::kTfLitePaddingSame:
      return PaddingType::kSame;
    case TfLitePadding::kTfLitePaddingValid:
      return PaddingType::kValid;
    case TfLitePadding::kTfLitePaddingUnknown:
    default:
      return PaddingType::kNone;
  }
}

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // This is a builtin op, so we don't use the contents in 'buffer', if any.
  // Instead, we allocate a new object to use as scratch space for im2col, and
  // to carry information from Prepare() to Eval().
  auto* data = new OpData;
  gemm_support::IncrementUsageCounter(context);
  eigen_support::IncrementUsageCounter(context);
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  eigen_support::DecrementUsageCounter(context);
  gemm_support::DecrementUsageCounter(context);
  delete reinterpret_cast<OpData*>(buffer);
}

// Naive implementation of transpose for floats. Could be optimized to be more
// cache friendly, but for now it's a one-time cost on first run, and we would
// prefer to remove the need to do this at all eventually.
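// For example (shapes are illustrative, not required by this function): a
// filter viewed as an 8 x 144 row-major matrix (channels_out x
// filter_height * filter_width * input_depth) is rewritten into the 144 x 8
// layout expected by the multithreaded EigenTensor kernel; see the
// hwcn_weights handling in Prepare().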
void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
  const int rows = output->dims->data[1];
  const int cols = output->dims->data[0];
  const float* input_data = GetTensorData<float>(input);
  float* output_data = GetTensorData<float>(output);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const float in_value = input_data[i * cols + j];
      output_data[j * rows + i] = in_value;
    }
  }
}

// Allocate temporary tensors (`im2col`, `hwcn_weights` if necessary).
// Note: `context->AddTensors` might invalidate pointers to existing tensors.
// Therefore the logic to add tensors is isolated into this function.
static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context,
                                                       TfLiteNode* node,
                                                       bool is_hybrid) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE(context, node->inputs->size >= 2);
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];

  int filter_width = filter->dims->data[2];
  int filter_height = filter->dims->data[1];

  // We don't always need to allocate im2col. It is only used in some versions
  // of the optimized Conv. This test just mimics something that happens inside
  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
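  // As a rough illustration: a 1x1 convolution with stride 1 and no dilation
  // can index the input directly and skips the im2col buffer, while e.g. a
  // 3x3 or a strided convolution needs the patches gathered into im2col
  // first.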
  data->need_im2col =
      (params->stride_width != 1 || params->stride_height != 1 ||
       params->dilation_width_factor != 1 ||
       params->dilation_height_factor != 1 || filter_width != 1 ||
       filter_height != 1);
  // If we're using the optimized multithreaded EigenTensor implementation of
  // convolution, it expects the filter weights to be transposed compared to
  // the normal TF Lite buffer format. Typical TF Lite weights are
  // [filter_count, filter_height, filter_width, input_depth], but for the
  // float implementation we need them as [filter_height, filter_width,
  // input_depth, filter_count]. We get to that format by transposing, and
  // create a temporary buffer to store the results.
  // This path is only used for float processing, so only create the buffer if
  // we're running with that data type.
  data->need_hwcn_weights = (input->type == kTfLiteFloat32 &&
                             data->run_multithreaded_kernel && !is_hybrid);

  int temporaries_count = 0;
  if (data->need_im2col) {
    data->im2col_index = temporaries_count;
    if (data->im2col_id == kTensorNotAllocated) {
      context->AddTensors(context, 1, &data->im2col_id);
    }
    ++temporaries_count;
  }
  if (data->need_hwcn_weights) {
    data->hwcn_weights_index = temporaries_count;
    if (data->hwcn_weights_id == kTensorNotAllocated) {
      context->AddTensors(context, 1, &data->hwcn_weights_id);
    }
    ++temporaries_count;
  }

  if (is_hybrid) {
    // Allocate tensor to store the on-the-fly quantized inputs.
    data->input_quantized_index = temporaries_count;
    if (data->input_quantized_id == kTensorNotAllocated) {
      TF_LITE_ENSURE_OK(
          context, context->AddTensors(context, 1, &data->input_quantized_id));
    }
    ++temporaries_count;

    // Allocate tensor to store the quantization params computed during
    // on-the-fly input quantization.
    data->scaling_factors_index = temporaries_count;
    if (data->scaling_factors_id == kTensorNotAllocated) {
      TF_LITE_ENSURE_OK(
          context, context->AddTensors(context, 1, &data->scaling_factors_id));
    }
    ++temporaries_count;
  }

  TfLiteIntArrayFree(node->temporaries);
  node->temporaries = TfLiteIntArrayCreate(temporaries_count);

  return kTfLiteOk;
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  bool has_bias = node->inputs->size == 3;
  // Check number of inputs/outputs
  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];

  // Check dimensionality of input, filter
  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
  TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
  // Check input channels matching filter
  TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);

  // Check types. (We assume that UINT8 refers to quantized tensors)
  TfLiteType input_type = input->type;
  TF_LITE_ENSURE(context,
                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
                     input_type == kTfLiteInt8);
  TF_LITE_ENSURE_EQ(context, output->type, input_type);

  TfLiteTensor* bias = nullptr;

  // TODO(ahentz): At this point the optimized versions require 'bias'. We can
  // either change that or document that convolution requires it.
  TF_LITE_ENSURE(context, has_bias);

  if (has_bias) {
    bias = &context->tensors[node->inputs->data[2]];
    if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
    } else {
      TF_LITE_ENSURE_EQ(context, bias->type, input_type);
    }
    TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
  }

  const bool is_hybrid =
      (input->type == kTfLiteFloat32 &&
       (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8));

  data->run_multithreaded_kernel = context->recommended_num_threads != 1;
  // Hybrid kernels don't support multithreading yet.
  if (is_hybrid) {
    data->run_multithreaded_kernel = false;
  }

  TF_LITE_ENSURE_STATUS(
      AllocateTemporaryTensorsIfRequired(context, node, is_hybrid));

  int channels_in = filter->dims->data[3];
  int channels_out = filter->dims->data[0];
  int width = input->dims->data[2];
  int height = input->dims->data[1];
  int filter_width = filter->dims->data[2];
  int filter_height = filter->dims->data[1];
  int batches = input->dims->data[0];

  // Matching GetWindowedOutputSize in TensorFlow.
  auto padding = params->padding;
  auto compute_out_size = [padding](int image_size, int filter_size,
                                    int stride, int dilation_rate) -> int {
    int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
    return padding == kTfLitePaddingSame
               ? (image_size + stride - 1) / stride
               : padding == kTfLitePaddingValid
                     ? (image_size - effective_filter_size + stride) / stride
                     : 0;
  };
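  // Illustrative numbers (not from any particular model): with SAME padding,
  // image_size = 224 and stride = 2 give out = (224 + 1) / 2 = 112; with
  // VALID padding, a 3x3 filter at dilation 2 has effective size 5, so
  // out = (224 - 5 + 2) / 2 = 110 using integer division.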

  int out_width = compute_out_size(width, filter_width, params->stride_width,
                                   params->dilation_width_factor);
  int out_height =
      compute_out_size(height, filter_height, params->stride_height,
                       params->dilation_height_factor);

  data->padding.height =
      ComputePadding(params->stride_height, params->dilation_height_factor,
                     height, filter_height, out_height);
  data->padding.width =
      ComputePadding(params->stride_width, params->dilation_width_factor,
                     width, filter_width, out_width);

  TF_LITE_ENSURE(context, has_bias);

  // Note that full fixed-point inference requires that all tensors have their
  // parameters set. This is usually done during quantized training or
  // calibration.
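  // PopulateConvolutionQuantizationParams below fills in both the per-tensor
  // multiplier/shift (used for uint8) and, for per-channel int8 quantization,
  // one multiplier/shift pair per output channel; conceptually each effective
  // scale is input_scale * filter_scale[c] / output_scale. (Informal
  // description; see quantization_util.h for the exact computation.)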
  if (input_type != kTfLiteFloat32) {
    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
                      kTfLiteAffineQuantization);
    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    TF_LITE_ENSURE(context, affine_quantization);
    TF_LITE_ENSURE(context, affine_quantization->scale);
    const int number_channel = affine_quantization->scale->size;
    data->per_channel_output_multiplier.resize(number_channel);
    data->per_channel_output_shift.resize(number_channel);
    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
        context, input, filter, bias, output, params->activation,
        &data->output_multiplier, &data->output_shift,
        &data->output_activation_min, &data->output_activation_max,
        data->per_channel_output_multiplier.data(),
        data->per_channel_output_shift.data()));
  }

  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
  output_size->data[0] = batches;
  output_size->data[1] = out_height;
  output_size->data[2] = out_width;
  output_size->data[3] = channels_out;
  auto output_status = context->ResizeTensor(context, output, output_size);

  if (output_status != kTfLiteOk) return output_status;

  if (data->need_im2col) {
    node->temporaries->data[data->im2col_index] = data->im2col_id;

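    // im2col gathers one filter-sized patch per output location, so the
    // temporary is shaped [batch, out_height, out_width,
    // filter_height * filter_width * input_depth]; e.g. a hypothetical 3x3
    // filter over an input depth of 16 gives 144 values per output pixel.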
    TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);

    int input_depth = input->dims->data[3];
    im2col_size->data[0] = output_size->data[0];
    im2col_size->data[1] = output_size->data[1];
    im2col_size->data[2] = output_size->data[2];
    im2col_size->data[3] = input_depth * filter_height * filter_width;

    TfLiteTensor* im2col =
        &context->tensors[node->temporaries->data[data->im2col_index]];
    im2col->type = input->type;
    if (is_hybrid) {
      im2col->type = filter->type;
    }
    im2col->allocation_type = kTfLiteArenaRw;
    auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
    if (im2col_status != kTfLiteOk) return im2col_status;
  }

  if (data->need_hwcn_weights) {
    node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
    TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);

    // Because we're treating the filter weights as a matrix when we do the
    // transpose, we allocate the buffer with a two-dimensional shape, where
    // one dimension is the number of elements in each filter, and the second
    // is the total number of filters.
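    // For example (an illustrative shape, not a requirement): an 8x3x3x16
    // OHWI filter is stored here as a [3 * 3 * 16, 8] = [144, 8] matrix.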
    int input_depth = input->dims->data[3];
    hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
    hwcn_weights_size->data[1] = channels_out;

    TfLiteTensor* hwcn_weights =
        &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
    hwcn_weights->type = input_type;
    hwcn_weights->allocation_type = kTfLiteArenaRwPersistent;

    auto hwcn_weights_status =
        context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
    if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;

    // TODO(petewarden): If Resize() is called when the size hasn't actually
    // changed, this will do extra redundant work.
    data->have_weights_been_transposed = false;
  }

  if (is_hybrid) {
    node->temporaries->data[data->input_quantized_index] =
        data->input_quantized_id;
    TfLiteTensor* input_quantized =
        GetTemporary(context, node, data->input_quantized_index);
    input_quantized->type = kTfLiteInt8;
    input_quantized->allocation_type = kTfLiteArenaRw;
    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
      TF_LITE_ENSURE_OK(context,
                        context->ResizeTensor(context, input_quantized,
                                              input_quantized_size));
    }

    node->temporaries->data[data->scaling_factors_index] =
        data->scaling_factors_id;
    TfLiteTensor* scaling_factors =
        GetTemporary(context, node, data->scaling_factors_index);
    scaling_factors->type = kTfLiteFloat32;
    scaling_factors->allocation_type = kTfLiteArenaRw;
    // Only one scale factor per batch is typically necessary. See the
    // optimized implementation for why we need to allocate for the height of
    // the inputs flattened to 2D.
    const int height = NumElements(input) / channels_in;
    int scaling_dims[1] = {height};
    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
      scaling_factors_size->data[0] = height;
      TF_LITE_ENSURE_OK(context,
                        context->ResizeTensor(context, scaling_factors,
                                              scaling_factors_size));
    }
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
                   TfLiteTensor* filter, TfLiteTensor* bias,
                   TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
                   TfLiteTensor* output) {
  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);

  auto input_offset = -input->params.zero_point;
  auto filter_offset = -filter->params.zero_point;
  auto output_offset = output->params.zero_point;
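
  // A quantized value q represents (q - zero_point) * scale, so the kernels
  // work with q + offset where offset = -zero_point for the inputs and
  // weights; the output zero point is added back after rescaling.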

  KernelType effective_kernel_type;
  if ((kernel_type == kMultithreadOptimized ||
       kernel_type == kCblasOptimized) &&
      (params->dilation_width_factor != 1 ||
       params->dilation_height_factor != 1)) {
    // kMultithreadOptimized and kCblasOptimized do not support dilation.
    // Therefore, fall back to the generic optimized kernel.
    effective_kernel_type = kGenericOptimized;
  } else {
    effective_kernel_type = kernel_type;
  }

  switch (effective_kernel_type) {
    case kReference: {
      ConvParams op_params;
      op_params.padding_type = PaddingType::kSame;
      op_params.padding_values.width = data->padding.width;
      op_params.padding_values.height = data->padding.height;
      op_params.stride_width = params->stride_width;
      op_params.stride_height = params->stride_height;
      op_params.dilation_width_factor = params->dilation_width_factor;
      op_params.dilation_height_factor = params->dilation_height_factor;
      op_params.input_offset = input_offset;
      op_params.weights_offset = filter_offset;
      op_params.output_offset = output_offset;
      op_params.output_multiplier = data->output_multiplier;
      op_params.output_shift = -data->output_shift;
      op_params.quantized_activation_min = data->output_activation_min;
      op_params.quantized_activation_max = data->output_activation_max;
      reference_ops::Conv(
          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
          GetTensorShape(bias), GetTensorData<int32_t>(bias),
          GetTensorShape(output), GetTensorData<uint8_t>(output),
          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
          gemm_context);
      break;
    }
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      // There is only one optimized implementation for Quantized Conv.
      ConvParams op_params;
      op_params.padding_type = PaddingType::kSame;
      op_params.padding_values.width = data->padding.width;
      op_params.padding_values.height = data->padding.height;
      op_params.stride_width = params->stride_width;
      op_params.stride_height = params->stride_height;
      op_params.dilation_width_factor = params->dilation_width_factor;
      op_params.dilation_height_factor = params->dilation_height_factor;
      op_params.input_offset = input_offset;
      op_params.weights_offset = filter_offset;
      op_params.output_offset = output_offset;
      op_params.output_multiplier = data->output_multiplier;
      op_params.output_shift = -data->output_shift;
      op_params.quantized_activation_min = data->output_activation_min;
      op_params.quantized_activation_max = data->output_activation_max;
      optimized_ops::Conv(
          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
          GetTensorShape(bias), GetTensorData<int32_t>(bias),
          GetTensorShape(output), GetTensorData<uint8_t>(output),
          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
          gemm_context);
      break;
    }
  }
}

void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                             TfLiteConvParams* params, OpData* data,
                             TfLiteTensor* input, TfLiteTensor* filter,
                             TfLiteTensor* bias, TfLiteTensor* output) {
  ConvParams op_params;
  op_params.input_offset = -input->params.zero_point;
  op_params.output_offset = output->params.zero_point;
  op_params.stride_height = params->stride_height;
  op_params.stride_width = params->stride_width;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.padding_values.height = data->padding.height;
  op_params.padding_values.width = data->padding.width;

  reference_integer_ops::ConvPerChannel(
      op_params, data->per_channel_output_multiplier.data(),
      data->per_channel_output_shift.data(), GetTensorShape(input),
      GetTensorData<int8>(input), GetTensorShape(filter),
      GetTensorData<int8>(filter), GetTensorShape(bias),
      GetTensorData<int32>(bias), GetTensorShape(output),
      GetTensorData<int8>(output));
}

template <KernelType kernel_type>
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
               TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
               TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);
  KernelType effective_kernel_type;
  if ((kernel_type == kMultithreadOptimized) &&
      (params->dilation_width_factor != 1 ||
       params->dilation_height_factor != 1)) {
    // kMultithreadOptimized does not support dilation.
    // Therefore, fall back to the generic optimized kernel.
    effective_kernel_type = kGenericOptimized;
  } else {
    effective_kernel_type = kernel_type;
  }
  ConvParams op_params;
  op_params.padding_type = RuntimePaddingType(params->padding);
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;
  switch (effective_kernel_type) {
    case kReference: {
      reference_ops::Conv(op_params, GetTensorShape(input),
                          GetTensorData<float>(input), GetTensorShape(filter),
                          GetTensorData<float>(filter), GetTensorShape(bias),
                          GetTensorData<float>(bias), GetTensorShape(output),
                          GetTensorData<float>(output), GetTensorShape(im2col),
                          GetTensorData<float>(im2col));
      break;
    }
    case kCblasOptimized:
    case kGenericOptimized: {
      optimized_ops::Conv(op_params, GetTensorShape(input),
                          GetTensorData<float>(input), GetTensorShape(filter),
                          GetTensorData<float>(filter), GetTensorShape(bias),
                          GetTensorData<float>(bias), GetTensorShape(output),
                          GetTensorData<float>(output), GetTensorShape(im2col),
                          GetTensorData<float>(im2col));
      break;
    }
    case kMultithreadOptimized: {
      const float* filter_data;
      if (data->need_hwcn_weights) {
        filter_data = GetTensorData<float>(hwcn_weights);
      } else {
        filter_data = GetTensorData<float>(filter);
      }
      multithreaded_ops::Conv(
          *eigen_support::GetThreadPoolDevice(context), op_params,
          GetTensorShape(input), GetTensorData<float>(input),
          GetTensorShape(filter), filter_data, GetTensorShape(bias),
          GetTensorData<float>(bias), GetTensorShape(output),
          GetTensorData<float>(output), GetTensorShape(im2col),
          GetTensorData<float>(im2col));
      break;
    }
  }
}

template <KernelType kernel_type>
void EvalHybrid(TfLiteContext* context, TfLiteNode* node,
                TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
                TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
                TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);

  const int input_size = NumElements(input) / SizeOfDimension(input, 0);
  const int batch_size = SizeOfDimension(input, 0);

  const TfLiteTensor* input_quantized =
      GetTemporary(context, node, data->input_quantized_index);
  int8_t* quantized_input_ptr_batch = input_quantized->data.int8;
  float* scaling_factors_ptr =
      GetTemporary(context, node, data->scaling_factors_index)->data.f;

  // Per-batch input quantization for higher accuracy.
  for (int b = 0; b < batch_size; ++b) {
    float unused_min, unused_max;
    const int offset = b * input_size;
    tensor_utils::SymmetricQuantizeFloats(
        input->data.f + offset, input_size, quantized_input_ptr_batch + offset,
        &unused_min, &unused_max, &scaling_factors_ptr[b]);
    scaling_factors_ptr[b] *= filter->params.scale;
  }
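
  // As a rough illustration: if the largest magnitude in batch b is 4.0,
  // SymmetricQuantizeFloats maps that batch to int8 with a scaling factor of
  // about 4.0 / 127, and folding filter->params.scale into the same factor
  // lets the int32 accumulator products be rescaled back to float with a
  // single multiply. (Sketch of the intent only; see tensor_utils for the
  // exact rounding behavior.)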

  int8_t* im2col_ptr = nullptr;
  int8_t* filter_ptr = nullptr;
  if (filter->type == kTfLiteUInt8) {
    // For backward compatibility, we need to support the case where filters
    // are quantized to int8 but stored as uint8.
    if (im2col != nullptr) {
      im2col_ptr = reinterpret_cast<int8_t*>(im2col->data.uint8);
    }
    filter_ptr = reinterpret_cast<int8_t*>(filter->data.uint8);
  } else {
    // The code at head uses the int8 type directly, so no cast is needed.
    if (im2col != nullptr) {
      im2col_ptr = im2col->data.int8;
    }
    filter_ptr = filter->data.int8;
  }

  switch (kernel_type) {
    case kReference:
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      // There is only one implementation for the hybrid kernel. Note that it
      // does not make use of gemmlowp nor does it support multithreading.
      ConvParams op_params;
      op_params.padding_type = PaddingType::kSame;
      op_params.padding_values.width = data->padding.width;
      op_params.padding_values.height = data->padding.height;
      op_params.stride_width = params->stride_width;
      op_params.stride_height = params->stride_height;
      op_params.dilation_width_factor = 1;
      op_params.dilation_height_factor = 1;
      op_params.float_activation_min = output_activation_min;
      op_params.float_activation_max = output_activation_max;
      optimized_ops::HybridConv(
          op_params, scaling_factors_ptr, GetTensorShape(input),
          quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
          GetTensorShape(bias), GetTensorData<float>(bias),
          GetTensorShape(output), GetTensorData<float>(output),
          GetTensorShape(im2col), im2col_ptr);
      break;
    }
  }
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
  bool has_bias = node->inputs->size == 3;
  TfLiteTensor* bias =
      has_bias ? &context->tensors[node->inputs->data[2]] : nullptr;
  TfLiteTensor* im2col =
      data->need_im2col
          ? &context->tensors[node->temporaries->data[data->im2col_index]]
          : nullptr;
  TfLiteTensor* hwcn_weights =
      data->need_hwcn_weights
          ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
          : nullptr;

  if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
    TransposeFloatTensor(filter, hwcn_weights);
    data->have_weights_been_transposed = true;
  }

  // TODO(aselle): Consider whether float conv and quantized conv should be
  // separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are the same.
    case kTfLiteFloat32:
      if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
        EvalHybrid<kernel_type>(context, node, params, data, input, filter,
                                bias, im2col, hwcn_weights, output);
      } else if (data->run_multithreaded_kernel) {
        EvalFloat<kernel_type>(context, node, params, data, input, filter,
                               bias, im2col, hwcn_weights, output);
      } else {
        EvalFloat<kGenericOptimized>(context, node, params, data, input,
                                     filter, bias, im2col, hwcn_weights,
                                     output);
      }
      break;
    case kTfLiteUInt8:
      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                 bias, im2col, hwcn_weights, output);
      break;
    case kTfLiteInt8:
      EvalQuantizedPerChannel(context, node, params, data, input, filter,
                              bias, output);
      break;
    default:
      context->ReportError(context, "Type %d not currently supported.",
                           input->type);
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace conv

TfLiteRegistration* Register_CONVOLUTION_REF() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kReference>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kMultithreadOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kCblasOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONV_2D() {
#ifdef TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
  return Register_CONVOLUTION_CBLAS_OPT();
#else
  return Register_CONVOLUTION_MULTITHREADED_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite