/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/c_api_internal.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace add {

// This file has three implementations of Add.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

struct OpData {
  bool requires_broadcast;

  // These fields are used in both the general 8-bit -> 8-bit quantized path,
  // and the special 16-bit -> 16-bit quantized path.
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8-bit quantized path.
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  auto* data = new OpData;
  data->requires_broadcast = false;
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  delete reinterpret_cast<OpData*>(buffer);
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  data->requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (data->requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
    // 8-bit -> 8-bit general quantized path, with general rescalings.
    data->input1_offset = -input1->params.zero_point;
    data->input2_offset = -input2->params.zero_point;
    data->output_offset = output->params.zero_point;
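    // The (offset-adjusted) inputs are shifted left by |left_shift| bits
    // before the per-input rescaling below, giving the fixed-point
    // multipliers enough headroom to be applied in 32-bit integer
    // arithmetic without losing precision.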
    data->left_shift = 20;
    const double twice_max_input_scale =
        2 * std::max(input1->params.scale, input2->params.scale);
    const double real_input1_multiplier =
        input1->params.scale / twice_max_input_scale;
    const double real_input2_multiplier =
        input2->params.scale / twice_max_input_scale;
    const double real_output_multiplier =
        twice_max_input_scale /
        ((1 << data->left_shift) * output->params.scale);
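    // Each real multiplier above is expected to be less than one, so it can
    // be encoded as a 32-bit fixed-point multiplier together with a
    // non-positive power-of-two exponent (the *_shift fields).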
    QuantizeMultiplierSmallerThanOneExp(
        real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_output_multiplier, &data->output_multiplier, &data->output_shift);

    if (output->type == kTfLiteUInt8) {
      CalculateActivationRangeUint8(params->activation, output,
                                    &data->output_activation_min,
                                    &data->output_activation_max);
    } else {
      CalculateActivationRangeInt8(params->activation, output,
                                   &data->output_activation_min,
                                   &data->output_activation_max);
    }
  } else if (output->type == kTfLiteInt16) {
    // 16-bit -> 16-bit special quantized path, supporting only a rather
    // narrow case of quantization parameters: zero_points must all be 0
    // ("symmetric quantization") and scales must be power-of-two (which
    // we abbreviate as "POT" below). The intended use case for this path
    // is in LSTM cells, where, due to the constraints of implementing
    // some of the math in these LSTM cells in fixed-point arithmetic,
    // we need to have such symmetric, power-of-two quantization
    // (fixed-point formats are inherently symmetric, power-of-two).
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    int input1_scale_log2_rounded;
    bool input1_scale_is_pot =
        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
    TF_LITE_ENSURE(context, input1_scale_is_pot);

    int input2_scale_log2_rounded;
    bool input2_scale_is_pot =
        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
    TF_LITE_ENSURE(context, input2_scale_is_pot);

    int output_scale_log2_rounded;
    bool output_scale_is_pot =
        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
    TF_LITE_ENSURE(context, output_scale_is_pot);
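    // With power-of-two scales, rescaling an input to the output scale is a
    // pure bit shift; the shift amount is the difference of the log2 scales.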
    data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
    data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;

    // Shifting of one input is supported. The graph quantization should ensure
    // that the other input matches the output.
    TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
    TF_LITE_ENSURE(context, data->input1_shift <= 0);
    TF_LITE_ENSURE(context, data->input2_shift <= 0);

    CalculateActivationRangeQuantized(context, params->activation, output,
                                      &data->output_activation_min,
                                      &data->output_activation_max);
  }

  return context->ResizeTensor(context, output, output_size);
}

template <KernelType kernel_type>
void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
             const OpData* data, const TfLiteTensor* input1,
             const TfLiteTensor* input2, TfLiteTensor* output) {
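// TF_LITE_ADD computes the float/int activation bounds, packs them into
// ArithmeticParams, and dispatches to the requested kernel namespace
// (reference_ops or optimized_ops) with the given element type.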
#define TF_LITE_ADD(type, opname, data_type)                              \
  data_type output_activation_min, output_activation_max;                 \
  CalculateActivationRange(params->activation, &output_activation_min,    \
                           &output_activation_max);                       \
  tflite::ArithmeticParams op_params;                                     \
  SetActivationParams(output_activation_min, output_activation_max,       \
                      &op_params);                                        \
  type::opname(op_params, GetTensorShape(input1),                         \
               GetTensorData<data_type>(input1), GetTensorShape(input2),  \
               GetTensorData<data_type>(input2), GetTensorShape(output),  \
               GetTensorData<data_type>(output))
  if (output->type == kTfLiteInt32) {
    if (kernel_type == kReference) {
      if (data->requires_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(reference_ops, Add, int32_t);
      }
    } else {
      if (data->requires_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(optimized_ops, Add, int32_t);
      }
    }
  } else if (output->type == kTfLiteFloat32) {
    if (kernel_type == kReference) {
      if (data->requires_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, float);
      } else {
        TF_LITE_ADD(reference_ops, Add, float);
      }
    } else {
      if (data->requires_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, float);
      } else {
        TF_LITE_ADD(optimized_ops, Add, float);
      }
    }
  }
#undef TF_LITE_ADD
}

template <KernelType kernel_type>
TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                              TfLiteAddParams* params, const OpData* data,
                              const TfLiteTensor* input1,
                              const TfLiteTensor* input2,
                              TfLiteTensor* output) {
  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
    tflite::ArithmeticParams op_params;
    op_params.left_shift = data->left_shift;
    op_params.input1_offset = data->input1_offset;
    op_params.input1_multiplier = data->input1_multiplier;
    op_params.input1_shift = data->input1_shift;
    op_params.input2_offset = data->input2_offset;
    op_params.input2_multiplier = data->input2_multiplier;
    op_params.input2_shift = data->input2_shift;
    op_params.output_offset = data->output_offset;
    op_params.output_multiplier = data->output_multiplier;
    op_params.output_shift = data->output_shift;
    SetActivationParams(data->output_activation_min,
                        data->output_activation_max, &op_params);
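    // ProcessBroadcastShapes returns true if the two input shapes require
    // broadcasting; it also records in op_params which broadcast fast path
    // (if any) can be used.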
    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
        GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_ADD(type, opname, dtype)                             \
  type::opname(op_params, GetTensorShape(input1),                    \
               GetTensorData<dtype>(input1), GetTensorShape(input2), \
               GetTensorData<dtype>(input2), GetTensorShape(output), \
               GetTensorData<dtype>(output));
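    // Note: the int8 case below always uses the reference integer kernels,
    // regardless of the requested kernel_type.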
    if (output->type == kTfLiteInt8) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
      } else {
        TF_LITE_ADD(reference_integer_ops, Add, int8_t);
      }
    } else {
      if (kernel_type == kReference) {
        if (need_broadcast) {
          TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
        } else {
          TF_LITE_ADD(reference_ops, Add, uint8_t);
        }
      } else {
        if (op_params.broadcast_category ==
            BroadcastableOpCategory::kGenericBroadcast) {
          TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, uint8_t);
        } else if (need_broadcast) {
          TF_LITE_ADD(optimized_ops, BroadcastAddFivefold, uint8_t);
        } else {
          TF_LITE_ADD(optimized_ops, Add, uint8_t);
        }
      }
    }
#undef TF_LITE_ADD
  } else if (output->type == kTfLiteInt16) {
#define TF_LITE_ADD(type, opname)                                      \
  tflite::ArithmeticParams op_params;                                  \
  op_params.input1_shift = data->input1_shift;                         \
  op_params.input2_shift = data->input2_shift;                         \
  SetActivationParams(data->output_activation_min,                     \
                      data->output_activation_max, &op_params);        \
  type::opname(op_params, GetTensorShape(input1),                      \
               GetTensorData<int16_t>(input1), GetTensorShape(input2), \
               GetTensorData<int16_t>(input2), GetTensorShape(output), \
               GetTensorData<int16_t>(output))
    // The 16-bit quantized Add has no broadcasting variant here, so the
    // non-broadcast Add kernel is used for both kernel types.
    if (kernel_type == kReference) {
      TF_LITE_ADD(reference_ops, Add);
    } else {
      TF_LITE_ADD(optimized_ops, Add);
    }
#undef TF_LITE_ADD
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
    EvalAdd<kernel_type>(context, node, params, data, input1, input2, output);
  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
             output->type == kTfLiteInt16) {
    TF_LITE_ENSURE_OK(context,
                      EvalAddQuantized<kernel_type>(context, node, params,
                                                    data, input1, input2,
                                                    output));
  } else {
    context->ReportError(context,
                         "Inputs and outputs not all float|int32|uint8|int8|"
                         "int16 types.");
    return kTfLiteError;
  }

  return kTfLiteOk;
}

}  // namespace add

TfLiteRegistration* Register_ADD_REF() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kReference>};
  return &r;
}

TfLiteRegistration* Register_ADD_GENERIC_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_ADD_NEON_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kNeonOptimized>};
  return &r;
}

TfLiteRegistration* Register_ADD() {
#ifdef USE_NEON
  return Register_ADD_NEON_OPT();
#else
  return Register_ADD_GENERIC_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite