/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>

#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/context.h"
#include "tensorflow/contrib/lite/kernels/gemm_support.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include "tensorflow/contrib/lite/kernels/op_macros.h"
#include "tensorflow/contrib/lite/kernels/padding.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace conv {

// This file has 4 implementations of Conv.
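// Which one runs is fixed at compile time via the kernel_type template
// argument to Eval; see the Register_CONVOLUTION_* functions at the bottom of
// this file, and Register_CONV_2D for the platform default.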
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kMultithreadOptimized,
  // The kernel uses the CBLAS interface for matrix multiplication. It's fast
  // when an optimized CBLAS implementation is available (e.g. the Apple
  // Accelerate Framework), and slow when falling back to a naive
  // implementation.
  kCblasOptimized,
};

struct OpData {
  // IDs are the arbitrary identifiers used by TF Lite to identify and access
  // memory buffers.
  int im2col_id;
  int hwcn_weights_id;

  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed-point multiplier plus a left shift.
  int32_t output_multiplier;
  int output_shift;
  // The range of the fused activation layer. For example, for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
  // Indexes are the offsets into the 'temporaries' array that keeps track of
  // the allocated temporary buffers.
  int32_t im2col_index;
  int32_t hwcn_weights_index;
  bool need_hwcn_weights;
  bool have_weights_been_transposed;
  bool need_im2col;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // This is a builtin op, so we don't use the contents in 'buffer', if any.
  // Instead, we allocate a new object to use as scratch space for im2col, and
  // to carry information from Prepare() to Eval().
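  // (context->AddTensors registers new tensor slots with the interpreter and
  // writes back their ids; Prepare() later installs these ids into
  // node->temporaries once it knows which temporaries are actually needed.)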
  auto* data = new OpData;
  context->AddTensors(context, 1, &data->im2col_id);
  context->AddTensors(context, 1, &data->hwcn_weights_id);
  gemm_support::IncrementUsageCounter(context);
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  gemm_support::DecrementUsageCounter(context);
  delete reinterpret_cast<OpData*>(buffer);
}

// Naive implementation of transpose for floats. Could be optimized to be more
// cache friendly, but for now it's a one-time cost on first run, and we would
// prefer to remove the need to do this at all eventually.
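// For illustration: with rows = 2 and cols = 3 (so the output dims are
// [3, 2]), the row-major input
//     [1 2 3]
//     [4 5 6]
// is written out as its transpose
//     [1 4]
//     [2 5]
//     [3 6]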
void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
  const int rows = output->dims->data[1];
  const int cols = output->dims->data[0];
  const float* input_data = GetTensorData<float>(input);
  float* output_data = GetTensorData<float>(output);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const float in_value = input_data[i * cols + j];
      output_data[j * rows + i] = in_value;
    }
  }
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  bool hasBias = node->inputs->size == 3;
  // Check number of inputs/outputs
  TF_LITE_ENSURE(context, hasBias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
  // Check dimensionality of input, filter
  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
  TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
  // Check input channels matching filter
  TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);

  // Check types. (We assume that UINT8 refers to quantized tensors)
  TfLiteType data_type = input->type;
  TF_LITE_ENSURE(context,
                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
  TF_LITE_ENSURE_EQ(context, output->type, data_type);
  TF_LITE_ENSURE_EQ(context, filter->type, data_type);

  TfLiteTensor* bias = nullptr;

  // TODO(ahentz): At this point the optimized versions require 'bias'. We can
  // either change that or document that convolution requires it.
  TF_LITE_ENSURE(context, hasBias);

  if (hasBias) {
    bias = &context->tensors[node->inputs->data[2]];
    if (data_type == kTfLiteUInt8) {
      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
    } else {
      TF_LITE_ENSURE_EQ(context, bias->type, data_type);
    }
    TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], filter->dims->data[0]);
  }

  int channels_out = filter->dims->data[0];
  int width = input->dims->data[2];
  int height = input->dims->data[1];
  int filter_width = filter->dims->data[2];
  int filter_height = filter->dims->data[1];
  int batches = input->dims->data[0];

  // Matching GetWindowedOutputSize in TensorFlow.
  auto padding = params->padding;
  auto computeOutSize = [padding](int imageSize, int filterSize,
                                  int stride) -> int {
    return padding == kTfLitePaddingSame
               ? (imageSize + stride - 1) / stride
               : padding == kTfLitePaddingValid
                     ? (imageSize - filterSize + stride) / stride
                     : 0;
  };
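  // For illustration, with imageSize = 224, filterSize = 3, stride = 2:
  //   kTfLitePaddingSame:  (224 + 2 - 1) / 2 = 112  (i.e. ceil(224 / 2))
  //   kTfLitePaddingValid: (224 - 3 + 2) / 2 = 111  (i.e. ceil((224 - 3 + 1) / 2))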

  int outWidth = computeOutSize(width, filter_width, params->stride_width);
  int outHeight = computeOutSize(height, filter_height, params->stride_height);

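  // ComputePadding (declared in padding.h) is expected to return the per-side
  // padding, effectively max(0, ((outSize - 1) * stride + filterSize -
  // imageSize) / 2). E.g. for height = 224, filter_height = 3, stride = 1,
  // outHeight = 224 under SAME padding, this gives (223 * 1 + 3 - 224) / 2 = 1
  // pixel on each side.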
  data->padding.height =
      ComputePadding(params->stride_height, height, filter_height, outHeight);
  data->padding.width =
      ComputePadding(params->stride_width, width, filter_width, outWidth);

  TF_LITE_ENSURE(context, hasBias);

  // Note that quantized inference requires that all tensors have their
  // parameters set. This is usually done during quantized training.
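  // The real multiplier below is input_scale * filter_scale / output_scale,
  // and QuantizeMultiplierSmallerThanOne expresses it as q * 2^-shift with q
  // in [0.5, 1) stored as a Q31 fixed-point int32. For illustration, a real
  // multiplier of 0.25 becomes output_multiplier = 1 << 30 (i.e. 0.5 in Q31)
  // and output_shift = 1, since 0.5 * 2^-1 = 0.25.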
  if (data_type != kTfLiteFloat32) {
    double real_multiplier = 0.0;
    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
        context, input, filter, bias, output, &real_multiplier));
    QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
                                     &data->output_shift);
    CalculateActivationRangeUint8(params->activation, output,
                                  &data->output_activation_min,
                                  &data->output_activation_max);
  }

  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
  output_size->data[0] = batches;
  output_size->data[1] = outHeight;
  output_size->data[2] = outWidth;
  output_size->data[3] = channels_out;
  auto output_status = context->ResizeTensor(context, output, output_size);

  if (output_status != kTfLiteOk) return output_status;

  // We don't always need to allocate im2col. It is only used in some versions
  // of the optimized Conv. This test just mimics something that happens inside
  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
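  // (Background: im2col lowers convolution to a single matrix multiplication
  // by unrolling each filter-sized input patch into the innermost dimension of
  // a scratch buffer. A 1x1 filter with stride 1 is already a plain matrix
  // multiplication over the input, so no scratch buffer is needed.)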
  data->need_im2col =
      (params->stride_width != 1 || params->stride_height != 1 ||
       filter_width != 1 || filter_height != 1);
  // If we're using the optimized multithreaded EigenTensor implementation of
  // convolution, it expects the filter weights to be transposed compared to
  // the normal TF Lite buffer format. Typical TF Lite weights are
  // [filter_count, filter_height, filter_width, input_depth], but for the
  // float implementation we need them as [filter_height, filter_width,
  // input_depth, filter_count]. We get to that format by transposing, and we
  // create a temporary buffer to store the result.
  // This path is only used for float processing, so only create the buffer if
  // we're running with that data type.
  data->need_hwcn_weights = (data_type == kTfLiteFloat32);

  int temporaries_count = 0;
  if (data->need_im2col) {
    data->im2col_index = temporaries_count;
    ++temporaries_count;
  }
  if (data->need_hwcn_weights) {
    data->hwcn_weights_index = temporaries_count;
    ++temporaries_count;
  }

  TfLiteIntArrayFree(node->temporaries);
  node->temporaries = TfLiteIntArrayCreate(temporaries_count);

  if (data->need_im2col) {
    node->temporaries->data[data->im2col_index] = data->im2col_id;

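    // Each output pixel's unrolled input patch holds input_depth *
    // filter_height * filter_width elements, so the im2col buffer below is
    // sized [batch, out_height, out_width, input_depth * filter_height *
    // filter_width].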
    TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);

    int input_depth = input->dims->data[3];
    im2col_size->data[0] = output_size->data[0];
    im2col_size->data[1] = output_size->data[1];
    im2col_size->data[2] = output_size->data[2];
    im2col_size->data[3] = input_depth * filter_height * filter_width;

    TfLiteTensor* im2col =
        &context->tensors[node->temporaries->data[data->im2col_index]];
    im2col->type = data_type;
    im2col->allocation_type = kTfLiteArenaRw;
    auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
    if (im2col_status != kTfLiteOk) return im2col_status;
  }

  if (data->need_hwcn_weights) {
    node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
    TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);

    // Because we're treating the filter weights as a matrix when we do the
    // transpose, we allocate the buffer with a two-dimensional shape, where
    // one dimension is the number of elements in each filter, and the second
    // is the total number of filters.
    int input_depth = input->dims->data[3];
    hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
    hwcn_weights_size->data[1] = channels_out;

    TfLiteTensor* hwcn_weights =
        &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
    hwcn_weights->type = data_type;
    hwcn_weights->allocation_type = kTfLiteDynamic;
    // Make sure we release any previous allocations before we reallocate.
    // TODO(petewarden): Persistent arenas would be a better fit for this, but
    // they aren't fully implemented yet.
    if (hwcn_weights->data.raw) {
      free(hwcn_weights->data.raw);
      hwcn_weights->data.raw = nullptr;
    }

    // Note that hwcn_weights is a kTfLiteDynamic tensor, and ResizeTensor
    // will actually allocate space for it. This would be more efficient if we
    // placed hwcn_weights in the persistent arena.
    auto hwcn_weights_status =
        context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
    if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;

    // TODO(petewarden): If Resize() is called when the size hasn't actually
    // changed, this will do extra redundant work.
    data->have_weights_been_transposed = false;
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
                   TfLiteTensor* filter, TfLiteTensor* bias,
                   TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
                   TfLiteTensor* output) {
  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);

  auto input_offset = -input->params.zero_point;
  auto filter_offset = -filter->params.zero_point;
  auto output_offset = output->params.zero_point;
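  // In the quantized scheme, real_value = scale * (quantized - zero_point), so
  // the kernels are handed the negated input/filter zero points (to recover
  // the centered values) and the positive output zero point (to re-offset the
  // results).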

  switch (kernel_type) {
    case kReference:
      reference_ops::Conv(
          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
          GetTensorData<int32_t>(bias), GetTensorDims(bias),
          params->stride_width, params->stride_height, data->padding.width,
          data->padding.height, output_offset, data->output_multiplier,
          data->output_shift, data->output_activation_min,
          data->output_activation_max, GetTensorData<uint8_t>(output),
          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
          GetTensorDims(im2col), gemm_context);
      break;
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized:
      // There is only one optimized implementation for Quantized Conv.
      optimized_ops::Conv(
          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
          GetTensorData<int32_t>(bias), GetTensorDims(bias),
          params->stride_width, params->stride_height, data->padding.width,
          data->padding.height, output_offset, data->output_multiplier,
          data->output_shift, data->output_activation_min,
          data->output_activation_max, GetTensorData<uint8_t>(output),
          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
          GetTensorDims(im2col), gemm_context);
      break;
  }
}

template <KernelType kernel_type>
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
               TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
               TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                &output_activation_max);

  switch (kernel_type) {
    case kReference: {
      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
                          GetTensorData<float>(filter), GetTensorDims(filter),
                          GetTensorData<float>(bias), GetTensorDims(bias),
                          params->stride_width, params->stride_height,
                          data->padding.width, data->padding.height,
                          output_activation_min, output_activation_max,
                          GetTensorData<float>(output), GetTensorDims(output),
                          GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
    case kGenericOptimized: {
      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
                          GetTensorData<float>(filter), GetTensorDims(filter),
                          GetTensorData<float>(bias), GetTensorDims(bias),
                          params->stride_width, params->stride_height,
                          data->padding.width, data->padding.height,
                          output_activation_min, output_activation_max,
                          GetTensorData<float>(output), GetTensorDims(output),
                          GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
    case kMultithreadOptimized: {
      const float* filter_data;
      if (data->need_hwcn_weights) {
        filter_data = GetTensorData<float>(hwcn_weights);
      } else {
        filter_data = GetTensorData<float>(filter);
      }
      multithreaded_ops::Conv(
          GetTensorData<float>(input), GetTensorDims(input), filter_data,
          GetTensorDims(filter), GetTensorData<float>(bias),
          GetTensorDims(bias), params->stride_width, params->stride_height,
          data->padding.width, data->padding.height, params->padding,
          output_activation_min, output_activation_max,
          GetTensorData<float>(output), GetTensorDims(output),
          GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
    case kCblasOptimized: {
      cblas_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
                      GetTensorData<float>(filter), GetTensorDims(filter),
                      GetTensorData<float>(bias), GetTensorDims(bias),
                      params->stride_width, params->stride_height,
                      data->padding.width, data->padding.height,
                      output_activation_min, output_activation_max,
                      GetTensorData<float>(output), GetTensorDims(output),
                      GetTensorData<float>(im2col), GetTensorDims(im2col));
      break;
    }
  }
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
  bool hasBias = node->inputs->size == 3;
  TfLiteTensor* bias =
      hasBias ? &context->tensors[node->inputs->data[2]] : nullptr;
  TfLiteTensor* im2col =
      data->need_im2col
          ? &context->tensors[node->temporaries->data[data->im2col_index]]
          : nullptr;
  TfLiteTensor* hwcn_weights =
      data->need_hwcn_weights
          ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
          : nullptr;

  if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
    TransposeFloatTensor(filter, hwcn_weights);
    data->have_weights_been_transposed = true;
  }

  // TODO(aselle): Consider whether float conv and quantized conv should be
  // separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are the same.
    case kTfLiteFloat32:
      EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
                             im2col, hwcn_weights, output);
      break;
    case kTfLiteUInt8:
      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                 bias, im2col, hwcn_weights, output);
      break;
    default:
      context->ReportError(context, "Type not currently supported.");
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace conv

TfLiteRegistration* Register_CONVOLUTION_REF() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kReference>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kMultithreadOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
                                 conv::Eval<conv::kCblasOptimized>};
  return &r;
}

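// Register_CONV_2D picks the default Conv kernel for the target platform. For
// illustration, the builtin op resolver typically wires it up along the lines
// of (a sketch, following the pattern in kernels/register.cc):
//   AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());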
TfLiteRegistration* Register_CONV_2D() {
#ifdef TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
  return Register_CONVOLUTION_CBLAS_OPT();
#else
  return Register_CONVOLUTION_MULTITHREADED_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite