/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace add {

// This file has three implementations of Add.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

struct OpData {
  bool requires_broadcast;

  // These fields are used in both the general 8-bit -> 8-bit quantized path,
  // and the special 16-bit -> 16-bit quantized path.
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8-bit quantized path.
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  auto* data = new OpData;
  data->requires_broadcast = false;
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  delete reinterpret_cast<OpData*>(buffer);
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  data->requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (data->requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
    // 8bit -> 8bit general quantized path, with general rescalings
    data->input1_offset = -input1->params.zero_point;
    data->input2_offset = -input2->params.zero_point;
    data->output_offset = output->params.zero_point;
    data->left_shift = 20;
    const double twice_max_input_scale =
        2 * std::max(input1->params.scale, input2->params.scale);
    const double real_input1_multiplier =
        input1->params.scale / twice_max_input_scale;
    const double real_input2_multiplier =
        input2->params.scale / twice_max_input_scale;
    const double real_output_multiplier =
        twice_max_input_scale /
        ((1 << data->left_shift) * output->params.scale);
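    // Rescaling overview: at evaluation time both (offset-adjusted) inputs
    // are shifted left by `left_shift` bits for integer headroom and then
    // multiplied by their fixed-point multipliers, bringing them to the
    // common scale twice_max_input_scale / (1 << left_shift). The raw sum is
    // then rescaled by output_multiplier/output_shift into the output scale.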

    QuantizeMultiplierSmallerThanOneExp(
        real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_output_multiplier, &data->output_multiplier, &data->output_shift);

    if (output->type == kTfLiteUInt8) {
      CalculateActivationRangeUint8(params->activation, output,
                                    &data->output_activation_min,
                                    &data->output_activation_max);
    } else {
      CalculateActivationRangeInt8(params->activation, output,
                                   &data->output_activation_min,
                                   &data->output_activation_max);
    }
  } else if (output->type == kTfLiteInt16) {
    // 16bit -> 16bit special quantized path, supporting only a rather
    // narrow case of quantization parameters: zero_points must all be 0
    // ("symmetric quantization") and scales must be power-of-two (which
    // we abbreviate as "POT" below). The intended use case for this path
    // is in LSTM cells, where, due to the constraints of implementing
    // some of the math in these LSTM cells in fixed-point arithmetic,
    // we need to have such symmetric, power-of-two quantization
    // (Fixed-point formats are inherently symmetric, power-of-two).
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    int input1_scale_log2_rounded;
    bool input1_scale_is_pot =
        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
    TF_LITE_ENSURE(context, input1_scale_is_pot);

    int input2_scale_log2_rounded;
    bool input2_scale_is_pot =
        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
    TF_LITE_ENSURE(context, input2_scale_is_pot);

    int output_scale_log2_rounded;
    bool output_scale_is_pot =
        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
    TF_LITE_ENSURE(context, output_scale_is_pot);

    data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
    data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;
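    // Illustrative example (hypothetical values): if input1's scale is 2^-12
    // and the output's scale is 2^-10, then input1_shift = -12 - (-10) = -2,
    // i.e. input1 is rescaled down by two bits to match the output scale.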

    // Shifting of one input is supported. The graph quantization should ensure
    // that the other input matches the output.
    TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
    TF_LITE_ENSURE(context, data->input1_shift <= 0);
    TF_LITE_ENSURE(context, data->input2_shift <= 0);

    CalculateActivationRangeQuantized(context, params->activation, output,
                                      &data->output_activation_min,
                                      &data->output_activation_max);
  }

  return context->ResizeTensor(context, output, output_size);
}

template <KernelType kernel_type>
void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
             const OpData* data, const TfLiteTensor* input1,
             const TfLiteTensor* input2, TfLiteTensor* output) {
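// TF_LITE_ADD expands to: compute the activation bounds in `data_type`, pack
// them into an ArithmeticParams, and dispatch to `opname` in the given kernel
// namespace (reference_ops or optimized_ops) with the tensors' shapes and
// data.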
#define TF_LITE_ADD(type, opname, data_type)                             \
  data_type output_activation_min, output_activation_max;                \
  CalculateActivationRange(params->activation, &output_activation_min,   \
                           &output_activation_max);                      \
  tflite::ArithmeticParams op_params;                                    \
  SetActivationParams(output_activation_min, output_activation_max,      \
                      &op_params);                                       \
  type::opname(op_params, GetTensorShape(input1),                        \
               GetTensorData<data_type>(input1), GetTensorShape(input2), \
               GetTensorData<data_type>(input2), GetTensorShape(output), \
               GetTensorData<data_type>(output))
  if (output->type == kTfLiteInt32) {
    if (kernel_type == kReference) {
      if (data->requires_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(reference_ops, Add, int32_t);
      }
    } else {
      if (data->requires_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(optimized_ops, Add, int32_t);
      }
    }
  } else if (output->type == kTfLiteFloat32) {
    if (kernel_type == kReference) {
      if (data->requires_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, float);
      } else {
        TF_LITE_ADD(reference_ops, Add, float);
      }
    } else {
      if (data->requires_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, float);
      } else {
        TF_LITE_ADD(optimized_ops, Add, float);
      }
    }
  }
#undef TF_LITE_ADD
}

template <KernelType kernel_type>
TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                              TfLiteAddParams* params, const OpData* data,
                              const TfLiteTensor* input1,
                              const TfLiteTensor* input2,
                              TfLiteTensor* output) {
  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
    tflite::ArithmeticParams op_params;
    op_params.left_shift = data->left_shift;
    op_params.input1_offset = data->input1_offset;
    op_params.input1_multiplier = data->input1_multiplier;
    op_params.input1_shift = data->input1_shift;
    op_params.input2_offset = data->input2_offset;
    op_params.input2_multiplier = data->input2_multiplier;
    op_params.input2_shift = data->input2_shift;
    op_params.output_offset = data->output_offset;
    op_params.output_multiplier = data->output_multiplier;
    op_params.output_shift = data->output_shift;
    SetActivationParams(data->output_activation_min,
                        data->output_activation_max, &op_params);
    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
        GetTensorShape(input1), GetTensorShape(input2), &op_params);
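    // ProcessBroadcastShapes classifies how the two shapes broadcast against
    // each other (filling op_params.broadcast_category) and returns whether
    // any broadcasting is needed at all; the optimized uint8 path below uses
    // the category to pick the fast "fivefold" broadcast kernel when the
    // pattern allows it, falling back to the generic 4D slow kernel.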
#define TF_LITE_ADD(type, opname, dtype)                             \
  type::opname(op_params, GetTensorShape(input1),                    \
               GetTensorData<dtype>(input1), GetTensorShape(input2), \
               GetTensorData<dtype>(input2), GetTensorShape(output), \
               GetTensorData<dtype>(output));
    if (output->type == kTfLiteInt8) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
      } else {
        TF_LITE_ADD(reference_integer_ops, Add, int8_t);
      }
    } else {
      if (kernel_type == kReference) {
        if (need_broadcast) {
          TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
        } else {
          TF_LITE_ADD(reference_ops, Add, uint8_t);
        }
      } else {
        if (op_params.broadcast_category ==
            BroadcastableOpCategory::kGenericBroadcast) {
          TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, uint8_t);
        } else if (need_broadcast) {
          TF_LITE_ADD(optimized_ops, BroadcastAddFivefold, uint8_t);
        } else {
          TF_LITE_ADD(optimized_ops, Add, uint8_t);
        }
      }
    }
#undef TF_LITE_ADD
  } else if (output->type == kTfLiteInt16) {
#define TF_LITE_ADD(type, opname)                                       \
  tflite::ArithmeticParams op_params;                                   \
  op_params.input1_shift = data->input1_shift;                          \
  op_params.input2_shift = data->input2_shift;                          \
  SetActivationParams(data->output_activation_min,                      \
                      data->output_activation_max, &op_params);         \
  type::opname(op_params, GetTensorShape(input1),                       \
               GetTensorData<int16_t>(input1), GetTensorShape(input2),  \
               GetTensorData<int16_t>(input2), GetTensorShape(output),  \
               GetTensorData<int16_t>(output))
    // This 16-bit path has no broadcast variant; both kernels call the plain
    // Add, with the activation range carried in op_params.
    if (kernel_type == kReference) {
      TF_LITE_ADD(reference_ops, Add);
    } else {
      TF_LITE_ADD(optimized_ops, Add);
    }
#undef TF_LITE_ADD
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
    EvalAdd<kernel_type>(context, node, params, data, input1, input2, output);
  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
             output->type == kTfLiteInt16) {
    TF_LITE_ENSURE_OK(context,
                      EvalAddQuantized<kernel_type>(context, node, params, data,
                                                    input1, input2, output));
  } else {
    context->ReportError(
        context,
        "Inputs and outputs not all float|int32|uint8|int8|int16 types.");
    return kTfLiteError;
  }

  return kTfLiteOk;
}

}  // namespace add

TfLiteRegistration* Register_ADD_REF() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kReference>};
  return &r;
}

TfLiteRegistration* Register_ADD_GENERIC_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_ADD_NEON_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kNeonOptimized>};
  return &r;
}

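// Register_ADD selects the NEON-optimized kernel when compiled with USE_NEON
// and falls back to the portable (Neon-free) optimized kernel otherwise.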
TfLiteRegistration* Register_ADD() {
#ifdef USE_NEON
  return Register_ADD_NEON_OPT();
#else
  return Register_ADD_GENERIC_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite