1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
17 
18 #include <stdint.h>
19 #include <sys/types.h>
20 #include <algorithm>
21 #include <cmath>
22 #include <functional>
23 #include <limits>
24 #include <memory>
25 #include <type_traits>
26 
27 #include "fixedpoint/fixedpoint.h"
28 #include "public/gemmlowp.h"
29 #include "tensorflow/lite/kernels/internal/common.h"
30 #include "tensorflow/lite/kernels/internal/quantization_util.h"
31 #include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
32 #include "tensorflow/lite/kernels/internal/reference/softmax.h"
33 #include "tensorflow/lite/kernels/internal/round.h"
34 #include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
35 #include "tensorflow/lite/kernels/internal/types.h"
36 
37 namespace tflite {
38 
39 namespace reference_ops {
40 
41 // Return true for broadcast case, false otherwise.
ProcessBroadcastShapes(const RuntimeShape & shape0,const RuntimeShape & shape1,tflite::ArithmeticParams * params)42 inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
43                                    const RuntimeShape& shape1,
44                                    tflite::ArithmeticParams* params) {
45   const int dims_count =
46       std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
47 
48   params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
49   RuntimeShape scalar_shape(dims_count, 1);
50 
51   auto extended_shape0 = RuntimeShape::ExtendedShape(dims_count, shape0);
52   auto extended_shape1 = RuntimeShape::ExtendedShape(dims_count, shape1);
53 
54   // Check for "exact" match, implicitly accepting any scalar shapes.
55   if (extended_shape0 == extended_shape1) {
56     params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
57     return false;
58   }
59 
60   for (int i = dims_count - 1; i >= 0; --i) {
61     if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) {
62       continue;
63     } else if (extended_shape0.Dims(i) == 1) {
64       params->broadcast_category =
65           BroadcastableOpCategory::kFirstInputBroadcastsFast;
66       break;
67     } else if (extended_shape1.Dims(i) == 1) {
68       params->broadcast_category =
69           BroadcastableOpCategory::kSecondInputBroadcastsFast;
70       break;
71     } else {
72       params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
73       break;
74     }
75   }
76 
77   if (params->broadcast_category !=
78           BroadcastableOpCategory::kFirstInputBroadcastsFast &&
79       params->broadcast_category !=
80           BroadcastableOpCategory::kSecondInputBroadcastsFast) {
81     return false;
82   }
83 
84   // From this point it is assumed contractually that corresponding dimensions
85   // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
86   const bool swap_inputs = params->broadcast_category ==
87                            BroadcastableOpCategory::kSecondInputBroadcastsFast;
88   const RuntimeShape* shape_a =
89       swap_inputs ? &extended_shape1 : &extended_shape0;
90   const RuntimeShape* shape_b =
91       swap_inputs ? &extended_shape0 : &extended_shape1;
92 
93   int i = dims_count - 1;
94   params->broadcast_shape[0] = 1;
95   params->broadcast_shape[1] = 1;
96   params->broadcast_shape[2] = 1;
97   params->broadcast_shape[3] = 1;
98   params->broadcast_shape[4] = 1;
99   // y_0 is greedy: include dims if both or neither equal 1: in other words,
100   // test for equality rather than (shape_a->Dims(i) != 1).
101   while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
102     params->broadcast_shape[4] *= shape_b->Dims(i);
103     --i;
104   }
105   // Here either input_a or input_b has dim of 1 (if i >= 0).  If it is input_b
106   // that has the unit dimension, the next two loops are not entered.
107   while (i >= 0 && shape_a->Dims(i) == 1) {
108     params->broadcast_shape[3] *= shape_b->Dims(i);
109     --i;
110   }
111   while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
112     params->broadcast_shape[2] *= shape_a->Dims(i);
113     --i;
114   }
115   // Here either input_a or input_b has dim of 1 (if i >= 0).
116   while (i >= 0 && shape_b->Dims(i) == 1) {
117     params->broadcast_shape[1] *= shape_a->Dims(i);
118     --i;
119   }
120   while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
121     params->broadcast_shape[0] *= shape_b->Dims(i);
122     --i;
123   }
124 
125   // Rarer case is when the broadcast dimensions cannot be handled by a fivefold
126   // loop.
127   if (i >= 0) {
128     params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
129   }
130   return true;
131 }
132 
Conv(const ConvParams & params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & filter_shape,const float * filter_data,const RuntimeShape & bias_shape,const float * bias_data,const RuntimeShape & output_shape,float * output_data,const RuntimeShape & im2col_shape,float * im2col_data)133 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
134                  const float* input_data, const RuntimeShape& filter_shape,
135                  const float* filter_data, const RuntimeShape& bias_shape,
136                  const float* bias_data, const RuntimeShape& output_shape,
137                  float* output_data, const RuntimeShape& im2col_shape,
138                  float* im2col_data) {
139   const int stride_width = params.stride_width;
140   const int stride_height = params.stride_height;
141   const int dilation_width_factor = params.dilation_width_factor;
142   const int dilation_height_factor = params.dilation_height_factor;
143   const int pad_width = params.padding_values.width;
144   const int pad_height = params.padding_values.height;
145   const float output_activation_min = params.float_activation_min;
146   const float output_activation_max = params.float_activation_max;
147   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
148   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
149   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
150 
151   (void)im2col_data;   // only used in optimized code.
152   (void)im2col_shape;  // only used in optimized code.
153   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
154   const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
155   const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
156   if (bias_data) {
157     TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
158   }
159   const int input_height = input_shape.Dims(1);
160   const int input_width = input_shape.Dims(2);
161   const int filter_height = filter_shape.Dims(1);
162   const int filter_width = filter_shape.Dims(2);
163   const int output_height = output_shape.Dims(1);
164   const int output_width = output_shape.Dims(2);
165   for (int batch = 0; batch < batches; ++batch) {
166     for (int out_y = 0; out_y < output_height; ++out_y) {
167       for (int out_x = 0; out_x < output_width; ++out_x) {
168         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
169           const int in_x_origin = (out_x * stride_width) - pad_width;
170           const int in_y_origin = (out_y * stride_height) - pad_height;
171           float total = 0.f;
172           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
173             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
174               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
175                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
176                 const int in_y =
177                     in_y_origin + dilation_height_factor * filter_y;
178                 // If the location is outside the bounds of the input image,
179                 // use zero as a default value.
180                 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
181                     (in_y < input_height)) {
182                   float input_value = input_data[Offset(
183                       input_shape, batch, in_y, in_x, in_channel)];
184                   float filter_value =
185                       filter_data[Offset(filter_shape, out_channel, filter_y,
186                                          filter_x, in_channel)];
187                   total += (input_value * filter_value);
188                 }
189               }
190             }
191           }
192           float bias_value = 0.0f;
193           if (bias_data) {
194             bias_value = bias_data[out_channel];
195           }
196           output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
197               ActivationFunctionWithMinMax(total + bias_value,
198                                            output_activation_min,
199                                            output_activation_max);
200         }
201       }
202     }
203   }
204 }
205 
Conv(const ConvParams & params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & filter_shape,const uint8 * filter_data,const RuntimeShape & bias_shape,const int32 * bias_data,const RuntimeShape & output_shape,uint8 * output_data,const RuntimeShape & im2col_shape,uint8 * im2col_data,gemmlowp::GemmContext * gemm_context)206 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
207                  const uint8* input_data, const RuntimeShape& filter_shape,
208                  const uint8* filter_data, const RuntimeShape& bias_shape,
209                  const int32* bias_data, const RuntimeShape& output_shape,
210                  uint8* output_data, const RuntimeShape& im2col_shape,
211                  uint8* im2col_data, gemmlowp::GemmContext* gemm_context) {
212   (void)im2col_data;   // only used in optimized code.
213   (void)im2col_shape;  // only used in optimized code.
214   (void)gemm_context;  // only used in optimized code.
215   const int stride_width = params.stride_width;
216   const int stride_height = params.stride_height;
217   const int dilation_width_factor = params.dilation_width_factor;
218   const int dilation_height_factor = params.dilation_height_factor;
219   const int pad_width = params.padding_values.width;
220   const int pad_height = params.padding_values.height;
221   const int32 input_offset = params.input_offset;
222   const int32 filter_offset = params.weights_offset;
223   const int32 output_offset = params.output_offset;
224   const int32 output_multiplier = params.output_multiplier;
225   const int output_shift = params.output_shift;
226   const int32 output_activation_min = params.quantized_activation_min;
227   const int32 output_activation_max = params.quantized_activation_max;
228   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
229 
230   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
231   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
232   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
233   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
234   const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
235   const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
236   if (bias_data) {
237     TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
238   }
239   const int input_height = input_shape.Dims(1);
240   const int input_width = input_shape.Dims(2);
241   const int filter_height = filter_shape.Dims(1);
242   const int filter_width = filter_shape.Dims(2);
243   const int output_height = output_shape.Dims(1);
244   const int output_width = output_shape.Dims(2);
245   for (int batch = 0; batch < batches; ++batch) {
246     for (int out_y = 0; out_y < output_height; ++out_y) {
247       for (int out_x = 0; out_x < output_width; ++out_x) {
248         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
249           const int in_x_origin = (out_x * stride_width) - pad_width;
250           const int in_y_origin = (out_y * stride_height) - pad_height;
251           int32 acc = 0;
252           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
253             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
254               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
255                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
256                 const int in_y =
257                     in_y_origin + dilation_height_factor * filter_y;
258                 // If the location is outside the bounds of the input image,
259                 // use zero as a default value.
260                 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
261                     (in_y < input_height)) {
262                   int32 input_val = input_data[Offset(input_shape, batch, in_y,
263                                                       in_x, in_channel)];
264                   int32 filter_val =
265                       filter_data[Offset(filter_shape, out_channel, filter_y,
266                                          filter_x, in_channel)];
267                   acc +=
268                       (filter_val + filter_offset) * (input_val + input_offset);
269                 }
270               }
271             }
272           }
273           if (bias_data) {
274             acc += bias_data[out_channel];
275           }
276           acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
277                                               output_shift);
278           acc += output_offset;
279           acc = std::max(acc, output_activation_min);
280           acc = std::min(acc, output_activation_max);
281           output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
282               static_cast<uint8>(acc);
283         }
284       }
285     }
286   }
287 }
288 
289 template <typename T>
DepthToSpace(const tflite::DepthToSpaceParams & op_params,const RuntimeShape & unextended_input_shape,const T * input_data,const RuntimeShape & unextended_output_shape,T * output_data)290 inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
291                          const RuntimeShape& unextended_input_shape,
292                          const T* input_data,
293                          const RuntimeShape& unextended_output_shape,
294                          T* output_data) {
295   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
296   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
297   const RuntimeShape input_shape =
298       RuntimeShape::ExtendedShape(4, unextended_input_shape);
299   const RuntimeShape output_shape =
300       RuntimeShape::ExtendedShape(4, unextended_output_shape);
301 
302   const int input_depth = input_shape.Dims(3);
303   const int input_width = input_shape.Dims(2);
304   const int input_height = input_shape.Dims(1);
305   const int input_batch = input_shape.Dims(0);
306 
307   const int output_depth = output_shape.Dims(3);
308   const int output_width = output_shape.Dims(2);
309   const int output_height = output_shape.Dims(1);
310   const int output_batch = output_shape.Dims(0);
311 
312   const int32 block_size = op_params.block_size;
313 
314   TFLITE_DCHECK_EQ(input_width * block_size, output_width);
315   TFLITE_DCHECK_EQ(input_height * block_size, output_height);
316   TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
317   TFLITE_DCHECK_EQ(input_batch, output_batch);
318 
319   for (int out_b = 0; out_b < output_batch; ++out_b) {
320     for (int out_h = 0; out_h < output_height; ++out_h) {
321       for (int out_w = 0; out_w < output_width; ++out_w) {
322         for (int out_d = 0; out_d < output_depth; ++out_d) {
323           const int in_d =
324               out_d + ((out_h % block_size) * block_size + out_w % block_size) *
325                           output_depth;
326 
327           const int in_w = out_w / block_size;
328           const int in_h = out_h / block_size;
329           const int in_b = out_b;
330 
331           const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
332           const int output_index =
333               Offset(output_shape, out_b, out_h, out_w, out_d);
334 
335           output_data[output_index] = input_data[input_index];
336         }
337       }
338     }
339   }
340 }
341 
342 template <typename T>
SpaceToDepth(const tflite::SpaceToDepthParams & op_params,const RuntimeShape & unextended_input_shape,const T * input_data,const RuntimeShape & unextended_output_shape,T * output_data)343 inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
344                          const RuntimeShape& unextended_input_shape,
345                          const T* input_data,
346                          const RuntimeShape& unextended_output_shape,
347                          T* output_data) {
348   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
349   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
350   const RuntimeShape input_shape =
351       RuntimeShape::ExtendedShape(4, unextended_input_shape);
352   const RuntimeShape output_shape =
353       RuntimeShape::ExtendedShape(4, unextended_output_shape);
354 
355   const int input_depth = input_shape.Dims(3);
356   const int input_width = input_shape.Dims(2);
357   const int input_height = input_shape.Dims(1);
358   const int input_batch = input_shape.Dims(0);
359 
360   const int output_depth = output_shape.Dims(3);
361   const int output_width = output_shape.Dims(2);
362   const int output_height = output_shape.Dims(1);
363   const int output_batch = output_shape.Dims(0);
364 
365   const int32 block_size = op_params.block_size;
366 
367   TFLITE_DCHECK_EQ(input_width, output_width * block_size);
368   TFLITE_DCHECK_EQ(input_height, output_height * block_size);
369   TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
370   TFLITE_DCHECK_EQ(input_batch, output_batch);
371 
372   for (int in_b = 0; in_b < input_batch; ++in_b) {
373     for (int in_h = 0; in_h < input_height; ++in_h) {
374       for (int in_w = 0; in_w < input_width; ++in_w) {
375         for (int in_d = 0; in_d < input_depth; ++in_d) {
376           const int out_d =
377               in_d + ((in_h % block_size) * block_size + in_w % block_size) *
378                          input_depth;
379           const int out_w = in_w / block_size;
380           const int out_h = in_h / block_size;
381           const int out_b = in_b;
382 
383           const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
384           const int output_index =
385               Offset(output_shape, out_b, out_h, out_w, out_d);
386 
387           output_data[output_index] = input_data[input_index];
388         }
389       }
390     }
391   }
392 }
393 
Elu(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)394 inline void Elu(const RuntimeShape& input_shape, const float* input_data,
395                 const RuntimeShape& output_shape, float* output_data) {
396   const int flat_size = MatchingFlatSize(input_shape, output_shape);
397   for (int i = 0; i < flat_size; ++i) {
398     const float val = input_data[i];
399     output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
400   }
401 }
402 
Relu(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)403 inline void Relu(const RuntimeShape& input_shape, const float* input_data,
404                  const RuntimeShape& output_shape, float* output_data) {
405   const int flat_size = MatchingFlatSize(input_shape, output_shape);
406   for (int i = 0; i < flat_size; ++i) {
407     const float val = input_data[i];
408     const float lower = 0;
409     const float clamped = val < lower ? lower : val;
410     output_data[i] = clamped;
411   }
412 }
413 
Relu1(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)414 inline void Relu1(const RuntimeShape& input_shape, const float* input_data,
415                   const RuntimeShape& output_shape, float* output_data) {
416   gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
417   const int flat_size = MatchingFlatSize(input_shape, output_shape);
418   for (int i = 0; i < flat_size; ++i) {
419     const float val = input_data[i];
420     const float upper = 1;
421     const float lower = -1;
422     const float clamped = val > upper ? upper : val < lower ? lower : val;
423     output_data[i] = clamped;
424   }
425 }
426 
Relu6(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)427 inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
428                   const RuntimeShape& output_shape, float* output_data) {
429   gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
430   const int flat_size = MatchingFlatSize(input_shape, output_shape);
431   for (int i = 0; i < flat_size; ++i) {
432     const float val = input_data[i];
433     const float upper = 6;
434     const float lower = 0;
435     const float clamped = val > upper ? upper : val < lower ? lower : val;
436     output_data[i] = clamped;
437   }
438 }
439 
440 template <typename T>
ReluX(const tflite::ActivationParams & params,const RuntimeShape & input_shape,const T * input_data,const RuntimeShape & output_shape,T * output_data)441 inline void ReluX(const tflite::ActivationParams& params,
442                   const RuntimeShape& input_shape, const T* input_data,
443                   const RuntimeShape& output_shape, T* output_data) {
444   gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
445   const int flat_size = MatchingFlatSize(input_shape, output_shape);
446   const T max_value = params.quantized_activation_max;
447   const T min_value = params.quantized_activation_min;
448   for (int i = 0; i < flat_size; ++i) {
449     const T val = input_data[i];
450     const T clamped =
451         val > max_value ? max_value : val < min_value ? min_value : val;
452     output_data[i] = clamped;
453   }
454 }
455 
LeakyRelu(const tflite::LeakyReluParams & params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)456 inline void LeakyRelu(const tflite::LeakyReluParams& params,
457                       const RuntimeShape& input_shape, const float* input_data,
458                       const RuntimeShape& output_shape, float* output_data) {
459   gemmlowp::ScopedProfilingLabel label("LeakyRelu (not fused)");
460   const int flat_size = MatchingFlatSize(input_shape, output_shape);
461   for (int i = 0; i < flat_size; ++i) {
462     const float val = input_data[i];
463     // Note that this implementation matches that of TensorFlow, and corresponds
464     // to the traditional LeakyRelu equation only for alpha <= 1.
465     output_data[i] = std::max(val, val * params.alpha);
466   }
467 }
468 
L2Normalization(const tflite::L2NormalizationParams & op_params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)469 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
470                             const RuntimeShape& input_shape,
471                             const float* input_data,
472                             const RuntimeShape& output_shape,
473                             float* output_data) {
474   const int trailing_dim = input_shape.DimensionsCount() - 1;
475   const int outer_size =
476       MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
477   const int depth =
478       MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
479   for (int i = 0; i < outer_size; ++i) {
480     float squared_l2_norm = 0;
481     for (int c = 0; c < depth; ++c) {
482       const float val = input_data[depth * i + c];
483       squared_l2_norm += val * val;
484     }
485     const float l2_norm = std::sqrt(squared_l2_norm);
486     for (int c = 0; c < depth; ++c) {
487       output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
488     }
489   }
490 }
491 
L2Normalization(const tflite::L2NormalizationParams & op_params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & output_shape,uint8 * output_data)492 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
493                             const RuntimeShape& input_shape,
494                             const uint8* input_data,
495                             const RuntimeShape& output_shape,
496                             uint8* output_data) {
497   const int trailing_dim = input_shape.DimensionsCount() - 1;
498   const int depth =
499       MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
500   const int outer_size =
501       MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
502   const int32 input_zero_point = op_params.input_zero_point;
503   for (int i = 0; i < outer_size; ++i) {
504     int32 square_l2_norm = 0;
505     for (int c = 0; c < depth; c++) {
506       int32 diff = input_data[depth * i + c] - input_zero_point;
507       square_l2_norm += diff * diff;
508     }
509     int32 inv_l2norm_multiplier;
510     int inv_l2norm_shift;
511     GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
512                                      &inv_l2norm_multiplier, &inv_l2norm_shift);
513     for (int c = 0; c < depth; c++) {
514       int32 diff = input_data[depth * i + c] - input_zero_point;
515       int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
516           128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
517       int32 unclamped_output_val = 128 + rescaled_diff;
518       int32 output_val = std::min(255, std::max(0, unclamped_output_val));
519       output_data[depth * i + c] = static_cast<uint8>(output_val);
520     }
521   }
522 }
523 
524 template <typename T>
Add(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)525 inline void Add(const ArithmeticParams& params,
526                 const RuntimeShape& input1_shape, const T* input1_data,
527                 const RuntimeShape& input2_shape, const T* input2_data,
528                 const RuntimeShape& output_shape, T* output_data) {
529   const int flat_size =
530       MatchingFlatSize(input1_shape, input2_shape, output_shape);
531   for (int i = 0; i < flat_size; ++i) {
532     output_data[i] = ActivationFunctionWithMinMax(
533         input1_data[i] + input2_data[i], params.quantized_activation_min,
534         params.quantized_activation_max);
535   }
536 }
537 
Add(const ArithmeticParams & params,const RuntimeShape & input1_shape,const float * input1_data,const RuntimeShape & input2_shape,const float * input2_data,const RuntimeShape & output_shape,float * output_data)538 inline void Add(const ArithmeticParams& params,
539                 const RuntimeShape& input1_shape, const float* input1_data,
540                 const RuntimeShape& input2_shape, const float* input2_data,
541                 const RuntimeShape& output_shape, float* output_data) {
542   const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
543   for (int i = 0; i < size; i++) {
544     auto x = input1_data[i] + input2_data[i];
545     output_data[i] = ActivationFunctionWithMinMax(
546         x, params.float_activation_min, params.float_activation_max);
547   }
548 }
549 
550 // T is expected to be either float or int.
551 template <typename T>
AddN(const RuntimeShape & input_shape,const size_t num_inputs,T * const * input_data,T * output_data)552 inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
553                  T* const* input_data, T* output_data) {
554   // All inputs and output should have the same shape, this is checked during
555   // Prepare stage.
556   const size_t size = input_shape.FlatSize();
557   for (int i = 0; i < size; ++i) {
558     T x = 0;
559     for (int j = 0; j < num_inputs; ++j) {
560       x += input_data[j][i];
561     }
562     output_data[i] = x;
563   }
564 }
565 
566 // Element-wise add that can often be used for inner loop of broadcast add as
567 // well as the non-broadcast add.
AddElementwise(int size,const ArithmeticParams & params,const uint8 * input1_data,const uint8 * input2_data,uint8 * output_data)568 inline void AddElementwise(int size, const ArithmeticParams& params,
569                            const uint8* input1_data, const uint8* input2_data,
570                            uint8* output_data) {
571   TFLITE_DCHECK_GT(params.input1_offset, -256);
572   TFLITE_DCHECK_GT(params.input2_offset, -256);
573   TFLITE_DCHECK_LT(params.input1_offset, 256);
574   TFLITE_DCHECK_LT(params.input2_offset, 256);
575 
576   for (int i = 0; i < size; ++i) {
577     const int32 input1_val = params.input1_offset + input1_data[i];
578     const int32 input2_val = params.input2_offset + input2_data[i];
579     const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
580     const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
581     const int32 scaled_input1_val =
582         MultiplyByQuantizedMultiplierSmallerThanOneExp(
583             shifted_input1_val, params.input1_multiplier, params.input1_shift);
584     const int32 scaled_input2_val =
585         MultiplyByQuantizedMultiplierSmallerThanOneExp(
586             shifted_input2_val, params.input2_multiplier, params.input2_shift);
587     const int32 raw_sum = scaled_input1_val + scaled_input2_val;
588     const int32 raw_output =
589         MultiplyByQuantizedMultiplierSmallerThanOneExp(
590             raw_sum, params.output_multiplier, params.output_shift) +
591         params.output_offset;
592     const int32 clamped_output =
593         std::min(params.quantized_activation_max,
594                  std::max(params.quantized_activation_min, raw_output));
595     output_data[i] = static_cast<uint8>(clamped_output);
596   }
597 }
598 
599 // Scalar-broadcast add that can be used for inner loop of more general
600 // broadcast add, so that, for example, scalar-broadcast with batch will still
601 // be fast.
AddScalarBroadcast(int size,const ArithmeticParams & params,uint8 input1_data,const uint8 * input2_data,uint8 * output_data)602 inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
603                                uint8 input1_data, const uint8* input2_data,
604                                uint8* output_data) {
605   TFLITE_DCHECK_GT(params.input1_offset, -256);
606   TFLITE_DCHECK_GT(params.input2_offset, -256);
607   TFLITE_DCHECK_LT(params.input1_offset, 256);
608   TFLITE_DCHECK_LT(params.input2_offset, 256);
609 
610   const int32 input1_val = params.input1_offset + input1_data;
611   const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
612   const int32 scaled_input1_val =
613       MultiplyByQuantizedMultiplierSmallerThanOneExp(
614           shifted_input1_val, params.input1_multiplier, params.input1_shift);
615   for (int i = 0; i < size; ++i) {
616     const int32 input2_val = params.input2_offset + input2_data[i];
617     const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
618     const int32 scaled_input2_val =
619         MultiplyByQuantizedMultiplierSmallerThanOneExp(
620             shifted_input2_val, params.input2_multiplier, params.input2_shift);
621     const int32 raw_sum = scaled_input1_val + scaled_input2_val;
622     const int32 raw_output =
623         MultiplyByQuantizedMultiplierSmallerThanOneExp(
624             raw_sum, params.output_multiplier, params.output_shift) +
625         params.output_offset;
626     const int32 clamped_output =
627         std::min(params.quantized_activation_max,
628                  std::max(params.quantized_activation_min, raw_output));
629     output_data[i] = static_cast<uint8>(clamped_output);
630   }
631 }
632 
Add(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8 * input1_data,const RuntimeShape & input2_shape,const uint8 * input2_data,const RuntimeShape & output_shape,uint8 * output_data)633 inline void Add(const ArithmeticParams& params,
634                 const RuntimeShape& input1_shape, const uint8* input1_data,
635                 const RuntimeShape& input2_shape, const uint8* input2_data,
636                 const RuntimeShape& output_shape, uint8* output_data) {
637   TFLITE_DCHECK_LE(params.quantized_activation_min,
638                    params.quantized_activation_max);
639   const int flat_size =
640       MatchingFlatSize(input1_shape, input2_shape, output_shape);
641 
642   TFLITE_DCHECK_GT(params.input1_offset, -256);
643   TFLITE_DCHECK_GT(params.input2_offset, -256);
644   TFLITE_DCHECK_LT(params.input1_offset, 256);
645   TFLITE_DCHECK_LT(params.input2_offset, 256);
646   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
647 }
648 
Add(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int16 * input1_data,const RuntimeShape & input2_shape,const int16 * input2_data,const RuntimeShape & output_shape,int16 * output_data)649 inline void Add(const ArithmeticParams& params,
650                 const RuntimeShape& input1_shape, const int16* input1_data,
651                 const RuntimeShape& input2_shape, const int16* input2_data,
652                 const RuntimeShape& output_shape, int16* output_data) {
653   TFLITE_DCHECK_LE(params.quantized_activation_min,
654                    params.quantized_activation_max);
655 
656   const int input1_shift = params.input1_shift;
657   const int flat_size =
658       MatchingFlatSize(output_shape, input1_shape, input2_shape);
659   const int16 output_activation_min = params.quantized_activation_min;
660   const int16 output_activation_max = params.quantized_activation_max;
661 
662   TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
663   TFLITE_DCHECK_LE(input1_shift, 0);
664   TFLITE_DCHECK_LE(params.input2_shift, 0);
665   const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
666   const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
667   const int input_right_shift =
668       input1_shift == 0 ? -params.input2_shift : -input1_shift;
669 
670   for (int i = 0; i < flat_size; i++) {
671     // F0 uses 0 integer bits, range [-1, 1].
672     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
673 
674     F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
675     F0 scaled_input = F0::FromRaw(
676         gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
677     F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
678     const int16 raw_output = result.raw();
679     const int16 clamped_output = std::min(
680         output_activation_max, std::max(output_activation_min, raw_output));
681     output_data[i] = clamped_output;
682   }
683 }
684 
685 // TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
686 // dimensionality if the runtime code does a single loop over one dimension
687 // that handles broadcasting as the base case. The code generator would then
688 // generate max(D1, D2) nested for loops.
689 // TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
690 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
691 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
692 // reference_ops.h.
BroadcastAdd4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const float * input1_data,const RuntimeShape & input2_shape,const float * input2_data,const RuntimeShape & output_shape,float * output_data)693 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
694                                const RuntimeShape& input1_shape,
695                                const float* input1_data,
696                                const RuntimeShape& input2_shape,
697                                const float* input2_data,
698                                const RuntimeShape& output_shape,
699                                float* output_data) {
700   gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float");
701   NdArrayDesc<4> desc1;
702   NdArrayDesc<4> desc2;
703   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
704                                       &desc2);
705   const RuntimeShape extended_output_shape =
706       RuntimeShape::ExtendedShape(4, output_shape);
707 
708   // In Tensorflow, the dimensions are canonically named (batch_number, row,
709   // col, channel), with extents (batches, height, width, depth), with the
710   // trailing dimension changing most rapidly (channels has the smallest stride,
711   // typically 1 element).
712   //
713   // In generated C code, we store arrays with the dimensions reversed. The
714   // first dimension has smallest stride.
715   //
716   // We name our variables by their Tensorflow convention, but generate C code
717   // nesting loops such that the innermost loop has the smallest stride for the
718   // best cache behavior.
719   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
720     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
721       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
722         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
723           output_data[Offset(extended_output_shape, b, y, x, c)] =
724               ActivationFunctionWithMinMax(
725                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
726                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
727                   params.float_activation_min, params.float_activation_max);
728         }
729       }
730     }
731   }
732 }
733 
BroadcastAdd4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int32 * input1_data,const RuntimeShape & input2_shape,const int32 * input2_data,const RuntimeShape & output_shape,int32 * output_data)734 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
735                                const RuntimeShape& input1_shape,
736                                const int32* input1_data,
737                                const RuntimeShape& input2_shape,
738                                const int32* input2_data,
739                                const RuntimeShape& output_shape,
740                                int32* output_data) {
741   gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32");
742   NdArrayDesc<4> desc1;
743   NdArrayDesc<4> desc2;
744   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
745                                       &desc2);
746   const RuntimeShape extended_output_shape =
747       RuntimeShape::ExtendedShape(4, output_shape);
748 
749   // In Tensorflow, the dimensions are canonically named (batch_number, row,
750   // col, channel), with extents (batches, height, width, depth), with the
751   // trailing dimension changing most rapidly (channels has the smallest stride,
752   // typically 1 element).
753   //
754   // In generated C code, we store arrays with the dimensions reversed. The
755   // first dimension has smallest stride.
756   //
757   // We name our variables by their Tensorflow convention, but generate C code
758   // nesting loops such that the innermost loop has the smallest stride for the
759   // best cache behavior.
760   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
761     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
762       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
763         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
764           output_data[Offset(extended_output_shape, b, y, x, c)] =
765               ActivationFunctionWithMinMax(
766                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
767                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
768                   params.quantized_activation_min,
769                   params.quantized_activation_max);
770         }
771       }
772     }
773   }
774 }
775 
BroadcastAdd4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8 * input1_data,const RuntimeShape & input2_shape,const uint8 * input2_data,const RuntimeShape & output_shape,uint8 * output_data)776 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
777                                const RuntimeShape& input1_shape,
778                                const uint8* input1_data,
779                                const RuntimeShape& input2_shape,
780                                const uint8* input2_data,
781                                const RuntimeShape& output_shape,
782                                uint8* output_data) {
783   gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8");
784   NdArrayDesc<4> desc1;
785   NdArrayDesc<4> desc2;
786   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
787                                       &desc2);
788   const RuntimeShape extended_output_shape =
789       RuntimeShape::ExtendedShape(4, output_shape);
790 
791   // In Tensorflow, the dimensions are canonically named (batch_number, row,
792   // col, channel), with extents (batches, height, width, depth), with the
793   // trailing dimension changing most rapidly (channels has the smallest stride,
794   // typically 1 element).
795   //
796   // In generated C code, we store arrays with the dimensions reversed. The
797   // first dimension has smallest stride.
798   //
799   // We name our variables by their Tensorflow convention, but generate C code
800   // nesting loops such that the innermost loop has the smallest stride for the
801   // best cache behavior.
802   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
803     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
804       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
805         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
806           const int32 input1_val =
807               params.input1_offset +
808               input1_data[SubscriptToIndex(desc1, b, y, x, c)];
809           const int32 input2_val =
810               params.input2_offset +
811               input2_data[SubscriptToIndex(desc2, b, y, x, c)];
812           const int32 shifted_input1_val =
813               input1_val * (1 << params.left_shift);
814           const int32 shifted_input2_val =
815               input2_val * (1 << params.left_shift);
816           const int32 scaled_input1_val =
817               MultiplyByQuantizedMultiplierSmallerThanOneExp(
818                   shifted_input1_val, params.input1_multiplier,
819                   params.input1_shift);
820           const int32 scaled_input2_val =
821               MultiplyByQuantizedMultiplierSmallerThanOneExp(
822                   shifted_input2_val, params.input2_multiplier,
823                   params.input2_shift);
824           const int32 raw_sum = scaled_input1_val + scaled_input2_val;
825           const int32 raw_output =
826               MultiplyByQuantizedMultiplierSmallerThanOneExp(
827                   raw_sum, params.output_multiplier, params.output_shift) +
828               params.output_offset;
829           const int32 clamped_output =
830               std::min(params.quantized_activation_max,
831                        std::max(params.quantized_activation_min, raw_output));
832           output_data[Offset(extended_output_shape, b, y, x, c)] =
833               static_cast<uint8>(clamped_output);
834         }
835       }
836     }
837   }
838 }
839 
BroadcastAddFivefold(const ArithmeticParams & unswitched_params,const RuntimeShape & unswitched_input1_shape,const uint8 * unswitched_input1_data,const RuntimeShape & unswitched_input2_shape,const uint8 * unswitched_input2_data,const RuntimeShape & output_shape,uint8 * output_data)840 inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
841                                  const RuntimeShape& unswitched_input1_shape,
842                                  const uint8* unswitched_input1_data,
843                                  const RuntimeShape& unswitched_input2_shape,
844                                  const uint8* unswitched_input2_data,
845                                  const RuntimeShape& output_shape,
846                                  uint8* output_data) {
847   ArithmeticParams switched_params = unswitched_params;
848   switched_params.input1_offset = unswitched_params.input2_offset;
849   switched_params.input1_multiplier = unswitched_params.input2_multiplier;
850   switched_params.input1_shift = unswitched_params.input2_shift;
851   switched_params.input2_offset = unswitched_params.input1_offset;
852   switched_params.input2_multiplier = unswitched_params.input1_multiplier;
853   switched_params.input2_shift = unswitched_params.input1_shift;
854 
855   const bool use_unswitched =
856       unswitched_params.broadcast_category ==
857       tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
858 
859   const ArithmeticParams& params =
860       use_unswitched ? unswitched_params : switched_params;
861   const uint8* input1_data =
862       use_unswitched ? unswitched_input1_data : unswitched_input2_data;
863   const uint8* input2_data =
864       use_unswitched ? unswitched_input2_data : unswitched_input1_data;
865 
866   // Fivefold nested loops. The second input resets its position for each
867   // iteration of the second loop. The first input resets its position at the
868   // beginning of the fourth loop. The innermost loop is an elementwise add of
869   // sections of the arrays.
870   uint8* output_data_ptr = output_data;
871   const uint8* input1_data_ptr = input1_data;
872   const uint8* input2_data_reset = input2_data;
873   // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
874   // between input shapes. y3 for input 1 is always broadcast, and so the
875   // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
876   // Put another way,
877   // input1.shape.FlatSize = y0 * y1 * y2 * y4,
878   // input2.shape.FlatSize = y0 * y2 * y3 * y4.
879   int y0 = params.broadcast_shape[0];
880   int y1 = params.broadcast_shape[1];
881   int y2 = params.broadcast_shape[2];
882   int y3 = params.broadcast_shape[3];
883   int y4 = params.broadcast_shape[4];
884   if (y4 > 1) {
885     // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
886     // dimension.
887     for (int i0 = 0; i0 < y0; ++i0) {
888       const uint8* input2_data_ptr;
889       for (int i1 = 0; i1 < y1; ++i1) {
890         input2_data_ptr = input2_data_reset;
891         for (int i2 = 0; i2 < y2; ++i2) {
892           for (int i3 = 0; i3 < y3; ++i3) {
893             AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
894                            output_data_ptr);
895             input2_data_ptr += y4;
896             output_data_ptr += y4;
897           }
898           // We have broadcast y4 of input1 data y3 times, and now move on.
899           input1_data_ptr += y4;
900         }
901       }
902       // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
903       input2_data_reset = input2_data_ptr;
904     }
905   } else {
906     // Special case of y4 == 1, in which the innermost loop is a single element
907     // and can be combined with the next (y3) as an inner broadcast.
908     //
909     // Note that this handles the case of pure scalar broadcast when
910     // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
911     // broadcast with batch (as y2 > 1).
912     //
913     // NOTE The process is the same as the above general case except simplified
914     // for y4 == 1 and the loop over y3 is contained within the
915     // AddScalarBroadcast function.
916     for (int i0 = 0; i0 < y0; ++i0) {
917       const uint8* input2_data_ptr;
918       for (int i1 = 0; i1 < y1; ++i1) {
919         input2_data_ptr = input2_data_reset;
920         for (int i2 = 0; i2 < y2; ++i2) {
921           AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
922                              output_data_ptr);
923           input2_data_ptr += y3;
924           output_data_ptr += y3;
925           input1_data_ptr += 1;
926         }
927       }
928       input2_data_reset = input2_data_ptr;
929     }
930   }
931 }
932 
933 template <typename T>
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)934 inline void Mul(const ArithmeticParams& params,
935                 const RuntimeShape& input1_shape, const T* input1_data,
936                 const RuntimeShape& input2_shape, const T* input2_data,
937                 const RuntimeShape& output_shape, T* output_data) {
938   T output_activation_min;
939   T output_activation_max;
940   GetActivationParams(params, &output_activation_min, &output_activation_max);
941 
942   const int flat_size =
943       MatchingFlatSize(input1_shape, input2_shape, output_shape);
944   for (int i = 0; i < flat_size; ++i) {
945     output_data[i] = ActivationFunctionWithMinMax(
946         input1_data[i] * input2_data[i], output_activation_min,
947         output_activation_max);
948   }
949 }
950 
951 // TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
952 // dimensionality if the runtime code does a single loop over one dimension
953 // that handles broadcasting as the base case. The code generator would then
954 // generate max(D1, D2) nested for loops.
955 // TODO(benoitjacob): BroadcastMul is intentionally duplicated from
956 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
957 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
958 // reference_ops.h.
959 template <typename T>
BroadcastMul4DSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)960 void BroadcastMul4DSlow(const ArithmeticParams& params,
961                         const RuntimeShape& unextended_input1_shape,
962                         const T* input1_data,
963                         const RuntimeShape& unextended_input2_shape,
964                         const T* input2_data,
965                         const RuntimeShape& unextended_output_shape,
966                         T* output_data) {
967   gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow");
968   T output_activation_min;
969   T output_activation_max;
970   GetActivationParams(params, &output_activation_min, &output_activation_max);
971 
972   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
973   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
974   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
975   const RuntimeShape output_shape =
976       RuntimeShape::ExtendedShape(4, unextended_output_shape);
977 
978   NdArrayDesc<4> desc1;
979   NdArrayDesc<4> desc2;
980   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
981                                       unextended_input2_shape, &desc1, &desc2);
982 
983   // In Tensorflow, the dimensions are canonically named (batch_number, row,
984   // col, channel), with extents (batches, height, width, depth), with the
985   // trailing dimension changing most rapidly (channels has the smallest stride,
986   // typically 1 element).
987   //
988   // In generated C code, we store arrays with the dimensions reversed. The
989   // first dimension has smallest stride.
990   //
991   // We name our variables by their Tensorflow convention, but generate C code
992   // nesting loops such that the innermost loop has the smallest stride for the
993   // best cache behavior.
994   for (int b = 0; b < output_shape.Dims(0); ++b) {
995     for (int y = 0; y < output_shape.Dims(1); ++y) {
996       for (int x = 0; x < output_shape.Dims(2); ++x) {
997         for (int c = 0; c < output_shape.Dims(3); ++c) {
998           output_data[Offset(output_shape, b, y, x, c)] =
999               ActivationFunctionWithMinMax(
1000                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
1001                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1002                   output_activation_min, output_activation_max);
1003         }
1004       }
1005     }
1006   }
1007 }
1008 
1009 // Element-wise mul that can often be used for inner loop of broadcast Mul as
1010 // well as the non-broadcast Mul.
MulElementwise(int size,const ArithmeticParams & params,const uint8 * input1_data,const uint8 * input2_data,uint8 * output_data)1011 inline void MulElementwise(int size, const ArithmeticParams& params,
1012                            const uint8* input1_data, const uint8* input2_data,
1013                            uint8* output_data) {
1014   for (int i = 0; i < size; ++i) {
1015     const int32 input1_val = params.input1_offset + input1_data[i];
1016     const int32 input2_val = params.input2_offset + input2_data[i];
1017     const int32 unclamped_result =
1018         params.output_offset +
1019         MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
1020                                                        params.output_multiplier,
1021                                                        params.output_shift);
1022     const int32 clamped_output =
1023         std::min(params.quantized_activation_max,
1024                  std::max(params.quantized_activation_min, unclamped_result));
1025     output_data[i] = static_cast<uint8>(clamped_output);
1026   }
1027 }
1028 
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8 * input1_data,const RuntimeShape & input2_shape,const uint8 * input2_data,const RuntimeShape & output_shape,uint8 * output_data)1029 inline void Mul(const ArithmeticParams& params,
1030                 const RuntimeShape& input1_shape, const uint8* input1_data,
1031                 const RuntimeShape& input2_shape, const uint8* input2_data,
1032                 const RuntimeShape& output_shape, uint8* output_data) {
1033   TFLITE_DCHECK_LE(params.quantized_activation_min,
1034                    params.quantized_activation_max);
1035   gemmlowp::ScopedProfilingLabel label("Mul/8bit");
1036   const int flat_size =
1037       MatchingFlatSize(input1_shape, input2_shape, output_shape);
1038 
1039   MulElementwise(flat_size, params, input1_data, input2_data, output_data);
1040 }
1041 
BroadcastMulFivefold(const ArithmeticParams & unswitched_params,const RuntimeShape & unswitched_input1_shape,const uint8 * unswitched_input1_data,const RuntimeShape & unswitched_input2_shape,const uint8 * unswitched_input2_data,const RuntimeShape & output_shape,uint8 * output_data)1042 inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
1043                                  const RuntimeShape& unswitched_input1_shape,
1044                                  const uint8* unswitched_input1_data,
1045                                  const RuntimeShape& unswitched_input2_shape,
1046                                  const uint8* unswitched_input2_data,
1047                                  const RuntimeShape& output_shape,
1048                                  uint8* output_data) {
1049   ArithmeticParams switched_params = unswitched_params;
1050   switched_params.input1_offset = unswitched_params.input2_offset;
1051   switched_params.input2_offset = unswitched_params.input1_offset;
1052 
1053   const bool use_unswitched =
1054       unswitched_params.broadcast_category ==
1055       tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
1056 
1057   const ArithmeticParams& params =
1058       use_unswitched ? unswitched_params : switched_params;
1059   const uint8* input1_data =
1060       use_unswitched ? unswitched_input1_data : unswitched_input2_data;
1061   const uint8* input2_data =
1062       use_unswitched ? unswitched_input2_data : unswitched_input1_data;
1063 
1064   // Fivefold nested loops. The second input resets its position for each
1065   // iteration of the second loop. The first input resets its position at the
1066   // beginning of the fourth loop. The innermost loop is an elementwise Mul of
1067   // sections of the arrays.
1068   uint8* output_data_ptr = output_data;
1069   const uint8* input1_data_ptr = input1_data;
1070   const uint8* input2_data_reset = input2_data;
1071   int y0 = params.broadcast_shape[0];
1072   int y1 = params.broadcast_shape[1];
1073   int y2 = params.broadcast_shape[2];
1074   int y3 = params.broadcast_shape[3];
1075   int y4 = params.broadcast_shape[4];
1076   for (int i0 = 0; i0 < y0; ++i0) {
1077     const uint8* input2_data_ptr;
1078     for (int i1 = 0; i1 < y1; ++i1) {
1079       input2_data_ptr = input2_data_reset;
1080       for (int i2 = 0; i2 < y2; ++i2) {
1081         for (int i3 = 0; i3 < y3; ++i3) {
1082           MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
1083                          output_data_ptr);
1084           input2_data_ptr += y4;
1085           output_data_ptr += y4;
1086         }
1087         input1_data_ptr += y4;
1088       }
1089     }
1090     input2_data_reset = input2_data_ptr;
1091   }
1092 }
1093 
BroadcastMul4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8 * input1_data,const RuntimeShape & input2_shape,const uint8 * input2_data,const RuntimeShape & output_shape,uint8 * output_data)1094 inline void BroadcastMul4DSlow(const ArithmeticParams& params,
1095                                const RuntimeShape& input1_shape,
1096                                const uint8* input1_data,
1097                                const RuntimeShape& input2_shape,
1098                                const uint8* input2_data,
1099                                const RuntimeShape& output_shape,
1100                                uint8* output_data) {
1101   gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
1102 
1103   NdArrayDesc<4> desc1;
1104   NdArrayDesc<4> desc2;
1105   // The input shapes are extended as part of NdArrayDesc initialization.
1106   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1107                                       &desc2);
1108   const RuntimeShape extended_output_shape =
1109       RuntimeShape::ExtendedShape(4, output_shape);
1110 
1111   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1112     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1113       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1114         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1115           const int32 input1_val =
1116               params.input1_offset +
1117               input1_data[SubscriptToIndex(desc1, b, y, x, c)];
1118           const int32 input2_val =
1119               params.input2_offset +
1120               input2_data[SubscriptToIndex(desc2, b, y, x, c)];
1121           const int32 unclamped_result =
1122               params.output_offset +
1123               MultiplyByQuantizedMultiplierSmallerThanOneExp(
1124                   input1_val * input2_val, params.output_multiplier,
1125                   params.output_shift);
1126           const int32 clamped_output = std::min(
1127               params.quantized_activation_max,
1128               std::max(params.quantized_activation_min, unclamped_result));
1129           output_data[Offset(extended_output_shape, b, y, x, c)] =
1130               static_cast<uint8>(clamped_output);
1131         }
1132       }
1133     }
1134   }
1135 }
1136 
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int16 * input1_data,const RuntimeShape & input2_shape,const int16 * input2_data,const RuntimeShape & output_shape,int16 * output_data)1137 inline void Mul(const ArithmeticParams& params,
1138                 const RuntimeShape& input1_shape, const int16* input1_data,
1139                 const RuntimeShape& input2_shape, const int16* input2_data,
1140                 const RuntimeShape& output_shape, int16* output_data) {
1141   gemmlowp::ScopedProfilingLabel label("Mul/Int16");
1142 
1143   const int flat_size =
1144       MatchingFlatSize(input1_shape, input2_shape, output_shape);
1145 
1146   for (int i = 0; i < flat_size; i++) {
1147     // F0 uses 0 integer bits, range [-1, 1].
1148     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
1149 
1150     F0 unclamped_result =
1151         F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
1152     output_data[i] = unclamped_result.raw();
1153   }
1154 }
1155 
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int16 * input1_data,const RuntimeShape & input2_shape,const int16 * input2_data,const RuntimeShape & output_shape,uint8 * output_data)1156 inline void Mul(const ArithmeticParams& params,
1157                 const RuntimeShape& input1_shape, const int16* input1_data,
1158                 const RuntimeShape& input2_shape, const int16* input2_data,
1159                 const RuntimeShape& output_shape, uint8* output_data) {
1160   gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8");
1161   int32 output_offset = params.output_offset;
1162   int32 output_activation_min = params.quantized_activation_min;
1163   int32 output_activation_max = params.quantized_activation_max;
1164   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
1165 
1166   const int flat_size =
1167       MatchingFlatSize(input1_shape, input2_shape, output_shape);
1168 
1169   for (int i = 0; i < flat_size; i++) {
1170     // F0 uses 0 integer bits, range [-1, 1].
1171     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
1172 
1173     F0 unclamped_result =
1174         F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
1175     int16 rescaled_result =
1176         gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
1177     int16 clamped_result =
1178         std::min<int16>(output_activation_max - output_offset, rescaled_result);
1179     clamped_result =
1180         std::max<int16>(output_activation_min - output_offset, clamped_result);
1181     output_data[i] = output_offset + clamped_result;
1182   }
1183 }
1184 
1185 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
1186 // dimensionality if the runtime code does a single loop over one dimension
1187 // that handles broadcasting as the base case. The code generator would then
1188 // generate max(D1, D2) nested for loops.
1189 template <typename T>
BroadcastDiv4DSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)1190 void BroadcastDiv4DSlow(const ArithmeticParams& params,
1191                         const RuntimeShape& unextended_input1_shape,
1192                         const T* input1_data,
1193                         const RuntimeShape& unextended_input2_shape,
1194                         const T* input2_data,
1195                         const RuntimeShape& unextended_output_shape,
1196                         T* output_data) {
1197   T output_activation_min;
1198   T output_activation_max;
1199   GetActivationParams(params, &output_activation_min, &output_activation_max);
1200 
1201   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
1202   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
1203   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
1204   const RuntimeShape output_shape =
1205       RuntimeShape::ExtendedShape(4, unextended_output_shape);
1206 
1207   NdArrayDesc<4> desc1;
1208   NdArrayDesc<4> desc2;
1209   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
1210                                       unextended_input2_shape, &desc1, &desc2);
1211 
1212   // In Tensorflow, the dimensions are canonically named (batch_number, row,
1213   // col, channel), with extents (batches, height, width, depth), with the
1214   // trailing dimension changing most rapidly (channels has the smallest
1215   // stride, typically 1 element).
1216   //
1217   // In generated C code, we store arrays with the dimensions reversed. The
1218   // first dimension has smallest stride.
1219   //
1220   // We name our variables by their Tensorflow convention, but generate C code
1221   // nesting loops such that the innermost loop has the smallest stride for
1222   // the best cache behavior.
1223   for (int b = 0; b < output_shape.Dims(0); ++b) {
1224     for (int y = 0; y < output_shape.Dims(1); ++y) {
1225       for (int x = 0; x < output_shape.Dims(2); ++x) {
1226         for (int c = 0; c < output_shape.Dims(3); ++c) {
1227           output_data[Offset(output_shape, b, y, x, c)] =
1228               ActivationFunctionWithMinMax(
1229                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] /
1230                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1231                   output_activation_min, output_activation_max);
1232         }
1233       }
1234     }
1235   }
1236 }
1237 
1238 template <typename T>
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)1239 inline void Div(const ArithmeticParams& params,
1240                 const RuntimeShape& input1_shape, const T* input1_data,
1241                 const RuntimeShape& input2_shape, const T* input2_data,
1242                 const RuntimeShape& output_shape, T* output_data) {
1243   T output_activation_min;
1244   T output_activation_max;
1245   GetActivationParams(params, &output_activation_min, &output_activation_max);
1246 
1247   const int flat_size =
1248       MatchingFlatSize(input1_shape, input2_shape, output_shape);
1249   for (int i = 0; i < flat_size; ++i) {
1250     output_data[i] = ActivationFunctionWithMinMax(
1251         input1_data[i] / input2_data[i], output_activation_min,
1252         output_activation_max);
1253   }
1254 }
1255 
SubNonBroadcast(const ArithmeticParams & params,const RuntimeShape & input1_shape,const float * input1_data,const RuntimeShape & input2_shape,const float * input2_data,const RuntimeShape & output_shape,float * output_data)1256 inline void SubNonBroadcast(const ArithmeticParams& params,
1257                             const RuntimeShape& input1_shape,
1258                             const float* input1_data,
1259                             const RuntimeShape& input2_shape,
1260                             const float* input2_data,
1261                             const RuntimeShape& output_shape,
1262                             float* output_data) {
1263   const int flat_size =
1264       MatchingFlatSize(input1_shape, input2_shape, output_shape);
1265   for (int i = 0; i < flat_size; ++i) {
1266     output_data[i] = ActivationFunctionWithMinMax(
1267         input1_data[i] - input2_data[i], params.float_activation_min,
1268         params.float_activation_max);
1269   }
1270 }
1271 
SubNonBroadcast(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int32 * input1_data,const RuntimeShape & input2_shape,const int32 * input2_data,const RuntimeShape & output_shape,int32 * output_data)1272 inline void SubNonBroadcast(const ArithmeticParams& params,
1273                             const RuntimeShape& input1_shape,
1274                             const int32* input1_data,
1275                             const RuntimeShape& input2_shape,
1276                             const int32* input2_data,
1277                             const RuntimeShape& output_shape,
1278                             int32* output_data) {
1279   const int flat_size =
1280       MatchingFlatSize(input1_shape, input2_shape, output_shape);
1281   for (int i = 0; i < flat_size; ++i) {
1282     output_data[i] = ActivationFunctionWithMinMax(
1283         input1_data[i] - input2_data[i], params.quantized_activation_min,
1284         params.quantized_activation_max);
1285   }
1286 }
1287 
1288 // TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
1289 // dimensionality if the runtime code does a single loop over one dimension
1290 // that handles broadcasting as the base case. The code generator would then
1291 // generate max(D1, D2) nested for loops.
1292 // TODO(benoitjacob): BroadcastSub is intentionally duplicated from
1293 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
1294 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
1295 // reference_ops.h.
BroadcastSub4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const float * input1_data,const RuntimeShape & input2_shape,const float * input2_data,const RuntimeShape & output_shape,float * output_data)1296 inline void BroadcastSub4DSlow(const ArithmeticParams& params,
1297                                const RuntimeShape& input1_shape,
1298                                const float* input1_data,
1299                                const RuntimeShape& input2_shape,
1300                                const float* input2_data,
1301                                const RuntimeShape& output_shape,
1302                                float* output_data) {
1303   gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/float");
1304   NdArrayDesc<4> desc1;
1305   NdArrayDesc<4> desc2;
1306   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1307                                       &desc2);
1308   const RuntimeShape extended_output_shape =
1309       RuntimeShape::ExtendedShape(4, output_shape);
1310 
1311   // In Tensorflow, the dimensions are canonically named (batch_number, row,
1312   // col, channel), with extents (batches, height, width, depth), with the
1313   // trailing dimension changing most rapidly (channels has the smallest stride,
1314   // typically 1 element).
1315   //
1316   // In generated C code, we store arrays with the dimensions reversed. The
1317   // first dimension has smallest stride.
1318   //
1319   // We name our variables by their Tensorflow convention, but generate C code
1320   // nesting loops such that the innermost loop has the smallest stride for the
1321   // best cache behavior.
1322   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1323     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1324       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1325         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1326           output_data[Offset(extended_output_shape, b, y, x, c)] =
1327               ActivationFunctionWithMinMax(
1328                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1329                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1330                   params.float_activation_min, params.float_activation_max);
1331         }
1332       }
1333     }
1334   }
1335 }
1336 
BroadcastSub4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8 * input1_data,const RuntimeShape & input2_shape,const uint8 * input2_data,const RuntimeShape & output_shape,uint8 * output_data)1337 inline void BroadcastSub4DSlow(const ArithmeticParams& params,
1338                                const RuntimeShape& input1_shape,
1339                                const uint8* input1_data,
1340                                const RuntimeShape& input2_shape,
1341                                const uint8* input2_data,
1342                                const RuntimeShape& output_shape,
1343                                uint8* output_data) {
1344   gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/uint8");
1345   NdArrayDesc<4> desc1;
1346   NdArrayDesc<4> desc2;
1347   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1348                                       &desc2);
1349   const RuntimeShape extended_output_shape =
1350       RuntimeShape::ExtendedShape(4, output_shape);
1351 
1352   // In Tensorflow, the dimensions are canonically named (batch_number, row,
1353   // col, channel), with extents (batches, height, width, depth), with the
1354   // trailing dimension changing most rapidly (channels has the smallest stride,
1355   // typically 1 element).
1356   //
1357   // In generated C code, we store arrays with the dimensions reversed. The
1358   // first dimension has smallest stride.
1359   //
1360   // We name our variables by their Tensorflow convention, but generate C code
1361   // nesting loops such that the innermost loop has the smallest stride for the
1362   // best cache behavior.
1363   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1364     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1365       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1366         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1367           const int32 input1_val =
1368               params.input1_offset +
1369               input1_data[SubscriptToIndex(desc1, b, y, x, c)];
1370           const int32 input2_val =
1371               params.input2_offset +
1372               input2_data[SubscriptToIndex(desc2, b, y, x, c)];
1373           const int32 shifted_input1_val =
1374               input1_val * (1 << params.left_shift);
1375           const int32 shifted_input2_val =
1376               input2_val * (1 << params.left_shift);
1377           const int32 scaled_input1_val =
1378               MultiplyByQuantizedMultiplierSmallerThanOneExp(
1379                   shifted_input1_val, params.input1_multiplier,
1380                   params.input1_shift);
1381           const int32 scaled_input2_val =
1382               MultiplyByQuantizedMultiplierSmallerThanOneExp(
1383                   shifted_input2_val, params.input2_multiplier,
1384                   params.input2_shift);
1385           const int32 raw_sub = scaled_input1_val - scaled_input2_val;
1386           const int32 raw_output =
1387               MultiplyByQuantizedMultiplierSmallerThanOneExp(
1388                   raw_sub, params.output_multiplier, params.output_shift) +
1389               params.output_offset;
1390           const int32 clamped_output =
1391               std::min(params.quantized_activation_max,
1392                        std::max(params.quantized_activation_min, raw_output));
1393           output_data[Offset(extended_output_shape, b, y, x, c)] =
1394               static_cast<uint8>(clamped_output);
1395         }
1396       }
1397     }
1398   }
1399 }
1400 
BroadcastSub4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int32 * input1_data,const RuntimeShape & input2_shape,const int32 * input2_data,const RuntimeShape & output_shape,int32 * output_data)1401 inline void BroadcastSub4DSlow(const ArithmeticParams& params,
1402                                const RuntimeShape& input1_shape,
1403                                const int32* input1_data,
1404                                const RuntimeShape& input2_shape,
1405                                const int32* input2_data,
1406                                const RuntimeShape& output_shape,
1407                                int32* output_data) {
1408   gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/int32");
1409   NdArrayDesc<4> desc1;
1410   NdArrayDesc<4> desc2;
1411   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1412                                       &desc2);
1413   const RuntimeShape extended_output_shape =
1414       RuntimeShape::ExtendedShape(4, output_shape);
1415 
1416   // In Tensorflow, the dimensions are canonically named (batch_number, row,
1417   // col, channel), with extents (batches, height, width, depth), with the
1418   // trailing dimension changing most rapidly (channels has the smallest stride,
1419   // typically 1 element).
1420   //
1421   // In generated C code, we store arrays with the dimensions reversed. The
1422   // first dimension has smallest stride.
1423   //
1424   // We name our variables by their Tensorflow convention, but generate C code
1425   // nesting loops such that the innermost loop has the smallest stride for the
1426   // best cache behavior.
1427   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1428     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1429       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1430         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1431           output_data[Offset(extended_output_shape, b, y, x, c)] =
1432               ActivationFunctionWithMinMax(
1433                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1434                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1435                   params.quantized_activation_min,
1436                   params.quantized_activation_max);
1437         }
1438       }
1439     }
1440   }
1441 }
1442 
1443 template <typename T>
BroadcastSub4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)1444 void BroadcastSub4DSlow(const ArithmeticParams& params,
1445                         const RuntimeShape& input1_shape, const T* input1_data,
1446                         const RuntimeShape& input2_shape, const T* input2_data,
1447                         const RuntimeShape& output_shape, T* output_data) {
1448   gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/templated");
1449   NdArrayDesc<4> desc1;
1450   NdArrayDesc<4> desc2;
1451   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1452                                       &desc2);
1453   const RuntimeShape extended_output_shape =
1454       RuntimeShape::ExtendedShape(4, output_shape);
1455 
1456   // In Tensorflow, the dimensions are canonically named (batch_number, row,
1457   // col, channel), with extents (batches, height, width, depth), with the
1458   // trailing dimension changing most rapidly (channels has the smallest stride,
1459   // typically 1 element).
1460   //
1461   // In generated C code, we store arrays with the dimensions reversed. The
1462   // first dimension has smallest stride.
1463   //
1464   // We name our variables by their Tensorflow convention, but generate C code
1465   // nesting loops such that the innermost loop has the smallest stride for the
1466   // best cache behavior.
1467   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1468     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1469       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1470         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1471           output_data[Offset(extended_output_shape, b, y, x, c)] =
1472               ActivationFunctionWithMinMax(
1473                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1474                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1475                   params.quantized_activation_min,
1476                   params.quantized_activation_max);
1477         }
1478       }
1479     }
1480   }
1481 }
1482 
1483 template <typename T>
Sub(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)1484 void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
1485          const T* input1_data, const RuntimeShape& input2_shape,
1486          const T* input2_data, const RuntimeShape& output_shape,
1487          T* output_data) {
1488   NdArrayDesc<4> desc1;
1489   NdArrayDesc<4> desc2;
1490   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1491                                       &desc2);
1492   const RuntimeShape extended_output_shape =
1493       RuntimeShape::ExtendedShape(4, output_shape);
1494 
1495   // In Tensorflow, the dimensions are canonically named (batch_number, row,
1496   // col, channel), with extents (batches, height, width, depth), with the
1497   // trailing dimension changing most rapidly (channels has the smallest stride,
1498   // typically 1 element).
1499   //
1500   // In generated C code, we store arrays with the dimensions reversed. The
1501   // first dimension has smallest stride.
1502   //
1503   // We name our variables by their Tensorflow convention, but generate C code
1504   // nesting loops such that the innermost loop has the smallest stride for the
1505   // best cache behavior.
1506   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1507     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1508       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1509         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1510           output_data[Offset(extended_output_shape, b, y, x, c)] =
1511               input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1512               input2_data[SubscriptToIndex(desc2, b, y, x, c)];
1513         }
1514       }
1515     }
1516   }
1517 }
1518 
SubWithActivation(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int32 * input1_data,const RuntimeShape & input2_shape,const int32 * input2_data,const RuntimeShape & output_shape,int32 * output_data)1519 inline void SubWithActivation(const ArithmeticParams& params,
1520                               const RuntimeShape& input1_shape,
1521                               const int32* input1_data,
1522                               const RuntimeShape& input2_shape,
1523                               const int32* input2_data,
1524                               const RuntimeShape& output_shape,
1525                               int32* output_data) {
1526   gemmlowp::ScopedProfilingLabel label("SubWithActivation");
1527   const int flat_size =
1528       MatchingFlatSize(input1_shape, input2_shape, input2_shape);
1529   for (int i = 0; i < flat_size; ++i) {
1530     output_data[i] = ActivationFunctionWithMinMax(
1531         input1_data[i] - input2_data[i], params.quantized_activation_min,
1532         params.quantized_activation_max);
1533   }
1534 }
1535 
SubWithActivation(const ArithmeticParams & params,const RuntimeShape & input1_shape,const float * input1_data,const RuntimeShape & input2_shape,const float * input2_data,const RuntimeShape & output_shape,float * output_data)1536 inline void SubWithActivation(const ArithmeticParams& params,
1537                               const RuntimeShape& input1_shape,
1538                               const float* input1_data,
1539                               const RuntimeShape& input2_shape,
1540                               const float* input2_data,
1541                               const RuntimeShape& output_shape,
1542                               float* output_data) {
1543   const int flat_size =
1544       MatchingFlatSize(input1_shape, input2_shape, input2_shape);
1545   for (int i = 0; i < flat_size; ++i) {
1546     output_data[i] = ActivationFunctionWithMinMax(
1547         input1_data[i] - input2_data[i], params.float_activation_min,
1548         params.float_activation_max);
1549   }
1550 }
1551 
Sub16(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int16_t * input1_data,const RuntimeShape & input2_shape,const int16_t * input2_data,const RuntimeShape & output_shape,int16_t * output_data)1552 inline void Sub16(const ArithmeticParams& params,
1553                   const RuntimeShape& input1_shape, const int16_t* input1_data,
1554                   const RuntimeShape& input2_shape, const int16_t* input2_data,
1555                   const RuntimeShape& output_shape, int16_t* output_data) {
1556   gemmlowp::ScopedProfilingLabel label("Sub/Int16");
1557   const int input1_shift = params.input1_shift;
1558   const int flat_size =
1559       MatchingFlatSize(output_shape, input1_shape, input2_shape);
1560   const int16 output_activation_min = params.quantized_activation_min;
1561   const int16 output_activation_max = params.quantized_activation_max;
1562 
1563   TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
1564   TFLITE_DCHECK_LE(input1_shift, 0);
1565   TFLITE_DCHECK_LE(params.input2_shift, 0);
1566   const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
1567   const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
1568   const int input_right_shift =
1569       input1_shift == 0 ? -params.input2_shift : -input1_shift;
1570 
1571   if (input1_shift == 0) {
1572     // F0 uses 0 integer bits, range [-1, 1].
1573     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
1574     for (int i = 0; i < flat_size; ++i) {
1575       F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
1576       F0 scaled_input = F0::FromRaw(
1577           gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
1578       F0 result = SaturatingSub(input_ready_scaled, scaled_input);
1579       const int16 raw_output = result.raw();
1580       const int16 clamped_output = std::min(
1581           output_activation_max, std::max(output_activation_min, raw_output));
1582       output_data[i] = clamped_output;
1583     }
1584   } else {
1585     // F0 uses 0 integer bits, range [-1, 1].
1586     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
1587     for (int i = 0; i < flat_size; ++i) {
1588       F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
1589       F0 scaled_input = F0::FromRaw(
1590           gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
1591       F0 result = SaturatingSub(scaled_input, input_ready_scaled);
1592       const int16 raw_output = result.raw();
1593       const int16 clamped_output = std::min(
1594           output_activation_max, std::max(output_activation_min, raw_output));
1595       output_data[i] = clamped_output;
1596     }
1597   }
1598 }
1599 
1600 template <typename Scalar>
Concatenation(const ConcatenationParams & params,const RuntimeShape * const * input_shapes,const Scalar * const * input_data,const RuntimeShape & output_shape,Scalar * output_data)1601 inline void Concatenation(const ConcatenationParams& params,
1602                           const RuntimeShape* const* input_shapes,
1603                           const Scalar* const* input_data,
1604                           const RuntimeShape& output_shape,
1605                           Scalar* output_data) {
1606   gemmlowp::ScopedProfilingLabel label("Concatenation");
1607   int axis = params.axis;
1608   int inputs_count = params.inputs_count;
1609   const int concat_dimensions = output_shape.DimensionsCount();
1610   TFLITE_DCHECK_LT(axis, concat_dimensions);
1611 
1612   int64_t concat_size = 0;
1613   for (int i = 0; i < inputs_count; i++) {
1614     TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
1615     for (int j = 0; j < concat_dimensions; j++) {
1616       if (j != axis) {
1617         MatchingDim(*input_shapes[i], j, output_shape, j);
1618       }
1619     }
1620     concat_size += input_shapes[i]->Dims(axis);
1621   }
1622   TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
1623   int64_t outer_size = 1;
1624   for (int i = 0; i < axis; ++i) {
1625     outer_size *= output_shape.Dims(i);
1626   }
1627   // For all input arrays,
1628   // FlatSize() = outer_size * Dims(axis) * base_inner_size;
1629   int64_t base_inner_size = 1;
1630   for (int i = axis + 1; i < concat_dimensions; ++i) {
1631     base_inner_size *= output_shape.Dims(i);
1632   }
1633 
1634   Scalar* output_ptr = output_data;
1635   for (int k = 0; k < outer_size; k++) {
1636     for (int i = 0; i < inputs_count; ++i) {
1637       const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
1638       memcpy(output_ptr, input_data[i] + k * copy_size,
1639              copy_size * sizeof(Scalar));
1640       output_ptr += copy_size;
1641     }
1642   }
1643 }
1644 
1645 // TODO(prabhumk): This is the same as the optimized implementation.
1646 // TODO(prabhumk): The quantized implementation of concatentation isn't fully
1647 // quantized as it takes scale as a floating point value. This should be fixed
1648 // when optimizng this routine further.
ConcatenationWithScaling(const ConcatenationParams & params,const RuntimeShape * const * input_shapes,const uint8 * const * input_data,const RuntimeShape & output_shape,uint8 * output_data)1649 inline void ConcatenationWithScaling(const ConcatenationParams& params,
1650                                      const RuntimeShape* const* input_shapes,
1651                                      const uint8* const* input_data,
1652                                      const RuntimeShape& output_shape,
1653                                      uint8* output_data) {
1654   gemmlowp::ScopedProfilingLabel label("ConcatenationWithScaling/Uint8");
1655   int axis = params.axis;
1656   const int32* input_zeropoint = params.input_zeropoint;
1657   const float* input_scale = params.input_scale;
1658   int inputs_count = params.inputs_count;
1659   const int32 output_zeropoint = params.output_zeropoint;
1660   const float output_scale = params.output_scale;
1661 
1662   const int concat_dimensions = output_shape.DimensionsCount();
1663   TFLITE_DCHECK_LT(axis, concat_dimensions);
1664 
1665   int64_t concat_size = 0;
1666   for (int i = 0; i < inputs_count; i++) {
1667     TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
1668     for (int j = 0; j < concat_dimensions; j++) {
1669       if (j != axis) {
1670         MatchingDim(*input_shapes[i], j, output_shape, j);
1671       }
1672     }
1673     concat_size += input_shapes[i]->Dims(axis);
1674   }
1675   TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
1676   int64_t outer_size = 1;
1677   for (int i = 0; i < axis; ++i) {
1678     outer_size *= output_shape.Dims(i);
1679   }
1680   // For all input arrays,
1681   // FlatSize() = outer_size * Dims(axis) * base_inner_size;
1682   int64_t base_inner_size = 1;
1683   for (int i = axis + 1; i < concat_dimensions; ++i) {
1684     base_inner_size *= output_shape.Dims(i);
1685   }
1686 
1687   const float inverse_output_scale = 1.f / output_scale;
1688   uint8* output_ptr = output_data;
1689   for (int k = 0; k < outer_size; k++) {
1690     for (int i = 0; i < inputs_count; ++i) {
1691       const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
1692       const uint8* input_ptr = input_data[i] + k * copy_size;
1693       if (input_zeropoint[i] == output_zeropoint &&
1694           input_scale[i] == output_scale) {
1695         memcpy(output_ptr, input_ptr, copy_size);
1696       } else {
1697         const float scale = input_scale[i] * inverse_output_scale;
1698         const float bias = -input_zeropoint[i] * scale;
1699         for (int j = 0; j < copy_size; ++j) {
1700           const int32_t value =
1701               static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
1702               output_zeropoint;
1703           output_ptr[j] =
1704               static_cast<uint8_t>(std::max(std::min(255, value), 0));
1705         }
1706       }
1707       output_ptr += copy_size;
1708     }
1709   }
1710 }
1711 
1712 template <typename Scalar>
Pack(const PackParams & params,const RuntimeShape * const * input_shapes,const Scalar * const * input_data,const RuntimeShape & output_shape,Scalar * output_data)1713 void Pack(const PackParams& params, const RuntimeShape* const* input_shapes,
1714           const Scalar* const* input_data, const RuntimeShape& output_shape,
1715           Scalar* output_data) {
1716   gemmlowp::ScopedProfilingLabel label("Pack");
1717   const int dimensions = output_shape.DimensionsCount();
1718   int axis = params.axis;
1719   int inputs_count = params.inputs_count;
1720 
1721   int outer_size = 1;
1722   for (int i = 0; i < axis; i++) {
1723     outer_size *= output_shape.Dims(i);
1724   }
1725   int copy_size = 1;
1726   for (int i = params.axis + 1; i < dimensions; i++) {
1727     copy_size *= output_shape.Dims(i);
1728   }
1729   TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
1730 
1731   for (int i = 0; i < inputs_count; ++i) {
1732     for (int k = 0; k < outer_size; k++) {
1733       const Scalar* input_ptr = input_data[i] + copy_size * k;
1734       int loc = k * inputs_count * copy_size + i * copy_size;
1735       memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
1736     }
1737   }
1738 }
1739 
1740 template <typename Scalar>
Unpack(const UnpackParams & params,const RuntimeShape & input_shape,const Scalar * input_data,const RuntimeShape & output_shape,Scalar * const * output_datas)1741 void Unpack(const UnpackParams& params, const RuntimeShape& input_shape,
1742             const Scalar* input_data, const RuntimeShape& output_shape,
1743             Scalar* const* output_datas) {
1744   gemmlowp::ScopedProfilingLabel label("Unpack");
1745   const int dimensions = input_shape.DimensionsCount();
1746   const int outputs_count = params.num_split;
1747 
1748   int outer_size = 1;
1749   int axis = params.axis;
1750   if (axis < 0) {
1751     axis += dimensions;
1752   }
1753   TFLITE_DCHECK_GE(axis, 0);
1754   TFLITE_DCHECK_LT(axis, dimensions);
1755   for (int i = 0; i < axis; ++i) {
1756     outer_size *= input_shape.Dims(i);
1757   }
1758   int copy_size = 1;
1759   for (int i = axis + 1; i < dimensions; ++i) {
1760     copy_size *= input_shape.Dims(i);
1761   }
1762   TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
1763 
1764   for (int i = 0; i < outputs_count; ++i) {
1765     for (int k = 0; k < outer_size; k++) {
1766       Scalar* output_ptr = output_datas[i] + copy_size * k;
1767       int loc = k * outputs_count * copy_size + i * copy_size;
1768       memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
1769     }
1770   }
1771 }
1772 
1773 template <typename Scalar>
PackWithScaling(const PackParams & params,const RuntimeShape * const * input_shapes,const uint8 * const * input_data,const RuntimeShape & output_shape,uint8 * output_data)1774 void PackWithScaling(const PackParams& params,
1775                      const RuntimeShape* const* input_shapes,
1776                      const uint8* const* input_data,
1777                      const RuntimeShape& output_shape, uint8* output_data) {
1778   gemmlowp::ScopedProfilingLabel label("PackWithScaling");
1779   const int dimensions = output_shape.DimensionsCount();
1780   int axis = params.axis;
1781   const int32* input_zeropoint = params.input_zeropoint;
1782   const float* input_scale = params.input_scale;
1783   int inputs_count = params.inputs_count;
1784   const int32 output_zeropoint = params.output_zeropoint;
1785   const float output_scale = params.output_scale;
1786 
1787   int outer_size = 1;
1788   for (int i = 0; i < axis; i++) {
1789     outer_size *= output_shape.Dims(i);
1790   }
1791   int copy_size = 1;
1792   for (int i = axis + 1; i < dimensions; i++) {
1793     copy_size *= output_shape.Dims(i);
1794   }
1795   TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
1796 
1797   Scalar* output_ptr = output_data;
1798   const float inverse_output_scale = 1.f / output_scale;
1799   for (int k = 0; k < outer_size; k++) {
1800     for (int i = 0; i < inputs_count; ++i) {
1801       if (input_zeropoint[i] == output_zeropoint &&
1802           input_scale[i] == output_scale) {
1803         memcpy(output_ptr, input_data[i] + k * copy_size,
1804                copy_size * sizeof(Scalar));
1805       } else {
1806         assert(false);
1807         const float scale = input_scale[i] * inverse_output_scale;
1808         const float bias = -input_zeropoint[i] * scale;
1809         auto input_ptr = input_data[i];
1810         for (int j = 0; j < copy_size; ++j) {
1811           const int32_t value =
1812               static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
1813               output_zeropoint;
1814           output_ptr[j] =
1815               static_cast<uint8_t>(std::max(std::min(255, value), 0));
1816         }
1817       }
1818       output_ptr += copy_size;
1819     }
1820   }
1821 }
1822 
1823 template <typename Scalar>
DepthConcatenation(const ConcatenationParams & params,const RuntimeShape * const * input_shapes,const Scalar * const * input_data,const RuntimeShape & output_shape,Scalar * output_data)1824 void DepthConcatenation(const ConcatenationParams& params,
1825                         const RuntimeShape* const* input_shapes,
1826                         const Scalar* const* input_data,
1827                         const RuntimeShape& output_shape, Scalar* output_data) {
1828   gemmlowp::ScopedProfilingLabel label("DepthConcatenation");
1829   auto params_copy = params;
1830   params_copy.axis = 3;
1831   Concatenation(params_copy, input_shapes, input_data, output_shape,
1832                 output_data);
1833 }
1834 
LstmCell(const LstmCellParams & params,const RuntimeShape & unextended_input_shape,const float * input_data,const RuntimeShape & unextended_prev_activ_shape,const float * prev_activ_data,const RuntimeShape & weights_shape,const float * weights_data,const RuntimeShape & unextended_bias_shape,const float * bias_data,const RuntimeShape & unextended_prev_state_shape,const float * prev_state_data,const RuntimeShape & unextended_output_state_shape,float * output_state_data,const RuntimeShape & unextended_output_activ_shape,float * output_activ_data,const RuntimeShape & unextended_concat_temp_shape,float * concat_temp_data,const RuntimeShape & unextended_activ_temp_shape,float * activ_temp_data)1835 inline void LstmCell(
1836     const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
1837     const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
1838     const float* prev_activ_data, const RuntimeShape& weights_shape,
1839     const float* weights_data, const RuntimeShape& unextended_bias_shape,
1840     const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
1841     const float* prev_state_data,
1842     const RuntimeShape& unextended_output_state_shape, float* output_state_data,
1843     const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
1844     const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
1845     const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) {
1846   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
1847   TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
1848   TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
1849   TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
1850   TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
1851   TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
1852   TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
1853   TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
1854   const RuntimeShape input_shape =
1855       RuntimeShape::ExtendedShape(4, unextended_input_shape);
1856   const RuntimeShape prev_activ_shape =
1857       RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
1858   const RuntimeShape bias_shape =
1859       RuntimeShape::ExtendedShape(4, unextended_bias_shape);
1860   const RuntimeShape prev_state_shape =
1861       RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
1862   const RuntimeShape output_state_shape =
1863       RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
1864   const RuntimeShape output_activ_shape =
1865       RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
1866   const RuntimeShape concat_temp_shape =
1867       RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
1868   const RuntimeShape activ_temp_shape =
1869       RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
1870   TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
1871 
1872   const int weights_dim_count = weights_shape.DimensionsCount();
1873   const int batches =
1874       MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
1875                   output_state_shape, 0, output_activ_shape, 0);
1876   const int height =
1877       MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
1878                   output_state_shape, 1, output_activ_shape, 1);
1879   const int width =
1880       MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
1881                   output_state_shape, 2, output_activ_shape, 2);
1882   const int input_depth = input_shape.Dims(3);
1883   const int prev_activ_depth = prev_activ_shape.Dims(3);
1884   const int total_input_depth = prev_activ_depth + input_depth;
1885   TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
1886                    total_input_depth);
1887   TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
1888   const int intern_activ_depth =
1889       MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
1890   TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
1891                    intern_activ_depth * total_input_depth);
1892   TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
1893   const int output_depth =
1894       MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
1895                   3, output_activ_shape, 3);
1896   TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
1897 
1898   // Concatenate prev_activ and input data together
1899   std::vector<float const*> concat_input_arrays_data;
1900   std::vector<RuntimeShape const*> concat_input_arrays_shapes;
1901   concat_input_arrays_data.push_back(input_data);
1902   concat_input_arrays_data.push_back(prev_activ_data);
1903   concat_input_arrays_shapes.push_back(&input_shape);
1904   concat_input_arrays_shapes.push_back(&prev_activ_shape);
1905   tflite::ConcatenationParams concat_params;
1906   concat_params.axis = 3;
1907   concat_params.inputs_count = concat_input_arrays_data.size();
1908   Concatenation(concat_params, &(concat_input_arrays_shapes[0]),
1909                 &(concat_input_arrays_data[0]), concat_temp_shape,
1910                 concat_temp_data);
1911 
1912   // Fully connected
1913   tflite::FullyConnectedParams fc_params;
1914   fc_params.float_activation_min = std::numeric_limits<float>::lowest();
1915   fc_params.float_activation_max = std::numeric_limits<float>::max();
1916   FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape,
1917                  weights_data, bias_shape, bias_data, activ_temp_shape,
1918                  activ_temp_data);
1919 
1920   // Memory state update (the LSTM "guts")
1921   for (int b = 0; b < batches; ++b) {
1922     for (int w = 0; w < width; ++w) {
1923       for (int h = 0; h < height; ++h) {
1924         for (int c = 0; c < output_depth; ++c) {
1925           const float input_gate =
1926               1.f /
1927               (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
1928                                                       0 * output_depth + c)]));
1929           const float new_input = std::tanh(activ_temp_data[Offset(
1930               activ_temp_shape, b, h, w, 1 * output_depth + c)]);
1931           const float forget_gate =
1932               1.f /
1933               (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
1934                                                       2 * output_depth + c)]));
1935           const float output_gate =
1936               1.f /
1937               (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
1938                                                       3 * output_depth + c)]));
1939           const float new_state =
1940               input_gate * new_input +
1941               forget_gate *
1942                   prev_state_data[Offset(prev_state_shape, b, h, w, c)];
1943           output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
1944           output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
1945               output_gate * std::tanh(new_state);
1946         }
1947       }
1948     }
1949   }
1950 }
1951 
1952 // Quantized LSTM cell implementation.
1953 // The quantization of the input, output arrays is as follows:
1954 //  - The input activations are quantized as uint8 on the interval
1955 //    [-1, 127/128].
1956 //    The rationale for that is that is the natural interval for output
1957 //    activations (see next point) and these need to be concatenated together.
1958 //    We could accommodate different ranges by re-scaling, but we empirically
1959 //    found that setting the input activations range to be [-1, 127/128] in the
1960 //    first place, removing the need for re-scaling, greatly improves accuracy.
1961 //  - The output activations are quantized as uint8 on the interval
1962 //    [-1, 127/128].
1963 //    The rationale for that is that the definition of a LSTM cell makes them
1964 //    intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
1965 //    makes for simpler, more accurate fixed-point arithmetic.
1966 //  - The output-at-previous-timestep state array is obviously quantized as
1967 //    the output activations.
1968 //  - The internal LSTM memory (not the output-at-previous-timestep, the other
1969 //    internal state array) is int16-quantized and may use any power-of-two,
1970 //    symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
1971 //    StateIntegerBits below, see the below discussion of that template
1972 //    parameter ("The StateIntegerBits template parameter").
1973 //  - The output of the internal fully-connected node is int16-quantized
1974 //    on the interval [-8, 8 * 32767/32768], the rationale for which is
1975 //    explained just below ("Why [-8, 8] for fully-connected output?").
1976 //
1977 //
1978 // === The StateIntegerBits template parameter ===
1979 //
1980 // The StateIntegerBits template parameter controls the fixed-point format used
1981 // to represent the internal memory of the LSTM cell (not the
1982 // output-at-previous-timestep, the other internal state array). It's currently
1983 // a template parameter so that the model can control that. The most typical
1984 // value for StateIntegerBits is 4. Other plausible values are anywhere between
1985 // 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
1986 // and drop that template parameter. The reason why it can't be a runtime
1987 // parameter is that this controls the fixed-point format used, i.e. we need to
1988 // generate actually different code based on it. In particular, we generate code
1989 // for a fixed-point tanh() implementation for that format, which internally
1990 // uses a fixed-point exp() implementation, which internally uses a
1991 // barrel-shifter with a number of steps that depends on StateIntegerBits.
1992 // Another consequence of that is that a higher value of StateIntegerBits
1993 // results in a more expensive implementation (more barrel shifter steps
1994 // needed).
1995 //
1996 //
1997 // === Why [-8, 8] for fully-connected output? ===
1998 //
1999 // This array is only fed to Logistic and Tanh functions, for which
2000 // the quantized implementation will want to use fixed-point arithmetic,
2001 // requiring a power-of-two representation interval. Thus, we should right
2002 // away quantize this array to a power-of-two interval; otherwise,
2003 // implementation will need to rescale that, losing any benefit that a tighter
2004 // representation interval might otherwise yield, while introducing some
2005 // numerical error and computational overhead.
2006 //
2007 // Now, Logistic and Tanh
2008 // are nearly constant (nearly equal to their horizontal asymptotes)
2009 // outside of a small bounded interval around 0:
2010 //
2011 //   Logistic(4) = 1 - 1.8e-2     Tanh(4) = 1 - 6.7e-4
2012 //   Logistic(8) = 1 - 3.4e-4     Tanh(8) = 1 - 2.3e-7
2013 //   Logistic(16) = 1 - 1.1e-7    Tanh(16) = 1 - 2.5e-14
2014 //
2015 // From this, we see that clamping to [-4, 4] would be too inaccurate
2016 // (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
2017 // while clamping to [-16, 16] would make no difference even in float32.
2018 // However, for a fixed-point implementation in 16-bit integers, using 5
2019 // integer bits to represent the [-16, 16] range would leave only 11
2020 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
2021 // representable values. Notice that is higher than the
2022 // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
2023 // Using [-8, 8] thus seems like the better compromise overall, enjoying
2024 // an increment of 2.4e-4 between representable values and a worst-case
2025 // clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
2026 // [-16, 16].
2027 //
2028 // Moreover, all other things being equal, it is nice to choose the narrower
2029 // representation range, as that makes the implementation of fixed-point
2030 // math functions a little cheaper (each integer bit requires an additional
2031 // barrel-shifter atep in the implementation of exp(-x)). That is further
2032 // reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
2033 // sense for 32-bit float or 32-bit fixed-point quantization, but we are
2034 // aiming for 16-bit fixed-point quantization of these internal nodes here.
2035 //
2036 template <int StateIntegerBits>
LstmCell(const LstmCellParams & params,const RuntimeShape & unextended_input_shape,const uint8 * input_data_uint8,const RuntimeShape & unextended_prev_activ_shape,const uint8 * prev_activ_data_uint8,const RuntimeShape & weights_shape,const uint8 * weights_data_uint8,const RuntimeShape & unextended_bias_shape,const int32 * bias_data_int32,const RuntimeShape & unextended_prev_state_shape,const int16 * prev_state_data_int16,const RuntimeShape & unextended_output_state_shape,int16 * output_state_data_int16,const RuntimeShape & unextended_output_activ_shape,uint8 * output_activ_data_uint8,const RuntimeShape & unextended_concat_temp_shape,uint8 * concat_temp_data_uint8,const RuntimeShape & unextended_activ_temp_shape,int16 * activ_temp_data_int16,gemmlowp::GemmContext * gemm_context)2037 inline void LstmCell(
2038     const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
2039     const uint8* input_data_uint8,
2040     const RuntimeShape& unextended_prev_activ_shape,
2041     const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape,
2042     const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape,
2043     const int32* bias_data_int32,
2044     const RuntimeShape& unextended_prev_state_shape,
2045     const int16* prev_state_data_int16,
2046     const RuntimeShape& unextended_output_state_shape,
2047     int16* output_state_data_int16,
2048     const RuntimeShape& unextended_output_activ_shape,
2049     uint8* output_activ_data_uint8,
2050     const RuntimeShape& unextended_concat_temp_shape,
2051     uint8* concat_temp_data_uint8,
2052     const RuntimeShape& unextended_activ_temp_shape,
2053     int16* activ_temp_data_int16, gemmlowp::GemmContext* gemm_context) {
2054   (void)gemm_context;  // only used in optimized code.
2055   int32 weights_zero_point = params.weights_zero_point;
2056   int32 accum_multiplier = params.accum_multiplier;
2057   int accum_shift = params.accum_shift;
2058   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
2059   TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
2060   TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
2061   TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
2062   TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
2063   TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
2064   TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
2065   TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
2066   const RuntimeShape input_shape =
2067       RuntimeShape::ExtendedShape(4, unextended_input_shape);
2068   const RuntimeShape prev_activ_shape =
2069       RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
2070   const RuntimeShape bias_shape =
2071       RuntimeShape::ExtendedShape(4, unextended_bias_shape);
2072   const RuntimeShape prev_state_shape =
2073       RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
2074   const RuntimeShape output_state_shape =
2075       RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
2076   const RuntimeShape output_activ_shape =
2077       RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
2078   const RuntimeShape concat_temp_shape =
2079       RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
2080   const RuntimeShape activ_temp_shape =
2081       RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
2082   TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
2083 
2084   // Gather dimensions information, and perform consistency checks.
2085   const int weights_dim_count = weights_shape.DimensionsCount();
2086   const int outer_size = MatchingFlatSizeSkipDim(
2087       input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape,
2088       output_activ_shape);
2089   const int input_depth = input_shape.Dims(3);
2090   const int prev_activ_depth = prev_activ_shape.Dims(3);
2091   const int total_input_depth = prev_activ_depth + input_depth;
2092   TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
2093                    total_input_depth);
2094   const int intern_activ_depth =
2095       MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
2096   TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
2097                    intern_activ_depth * total_input_depth);
2098   TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
2099   TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
2100   const int output_depth =
2101       MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
2102                   3, output_activ_shape, 3);
2103   TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
2104   const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
2105   const int fc_output_depth =
2106       MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
2107   const int fc_accum_depth = total_input_depth;
2108   TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
2109 
2110   // Depth-concatenate prev_activ and input data together.
2111   uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
2112                                               prev_activ_data_uint8};
2113   const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
2114                                                        &prev_activ_shape};
2115   tflite::ConcatenationParams concat_params;
2116   concat_params.axis = 3;
2117   concat_params.inputs_count = 2;
2118   Concatenation(concat_params, concat_input_arrays_shapes,
2119                 concat_input_arrays_data, concat_temp_shape,
2120                 concat_temp_data_uint8);
2121 
2122   // Implementation of the fully connected node inside the LSTM cell.
2123   // The operands are 8-bit integers, the accumulators are internally 32bit
2124   // integers, and the output is 16-bit fixed-point with 3 integer bits so
2125   // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
2126   // is explained in the function comment above.
2127   for (int b = 0; b < fc_batches; ++b) {
2128     for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
2129       // Internal accumulation.
2130       // Initialize accumulator with the bias-value.
2131       int32 accum = bias_data_int32[out_c];
2132       // Accumulation loop.
2133       for (int d = 0; d < fc_accum_depth; ++d) {
2134         int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
2135         int16 weights_val =
2136             weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
2137         accum += input_val * weights_val;
2138       }
2139       // Down-scale the final int32 accumulator to the scale used by our
2140       // (16-bit, using 3 integer bits) fixed-point format. The quantized
2141       // multiplier and shift here have been pre-computed offline
2142       // (e.g. by toco).
2143       accum =
2144           MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
2145       // Saturate, cast to int16, and store to the temporary activations array.
2146       accum = std::max(-32768, std::min(32767, accum));
2147       activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
2148     }
2149   }
2150 
2151   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
2152   // and muls, all done in 16-bit fixed-point.
2153   for (int b = 0; b < outer_size; ++b) {
2154     for (int c = 0; c < output_depth; ++c) {
2155       // Define the fixed-point data types that we will use here. All use
2156       // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
2157       // They only differ by the number of integral vs. fractional bits,
2158       // determining the range of values that they can represent.
2159       //
2160       // F0 uses 0 integer bits, range [-1, 1].
2161       // This is the return type of math functions such as tanh, logistic,
2162       // whose range is in [-1, 1].
2163       using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
2164       // F3 uses 3 integer bits, range [-8, 8].
2165       // This is the range of the previous fully-connected node's output,
2166       // which is our input here.
2167       using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
2168       // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
2169       // 2^StateIntegerBits]. It's used to represent the internal state, whose
2170       // number of integer bits is currently dictated by the model. See comment
2171       // on the StateIntegerBits template parameter above.
2172       using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
2173       // Implementation of input gate, using fixed-point logistic function.
2174       F3 input_gate_input = F3::FromRaw(
2175           activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
2176       F0 input_gate_output = gemmlowp::logistic(input_gate_input);
2177       // Implementation of input modulation gate, using fixed-point tanh
2178       // function.
2179       F3 input_modulation_gate_input = F3::FromRaw(
2180           activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
2181       F0 input_modulation_gate_output =
2182           gemmlowp::tanh(input_modulation_gate_input);
2183       // Implementation of forget gate, using fixed-point logistic function.
2184       F3 forget_gate_input = F3::FromRaw(
2185           activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
2186       F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
2187       // Implementation of output gate, using fixed-point logistic function.
2188       F3 output_gate_input = F3::FromRaw(
2189           activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
2190       F0 output_gate_output = gemmlowp::logistic(output_gate_input);
2191       // Implementation of internal multiplication nodes, still in fixed-point.
2192       F0 input_times_input_modulation =
2193           input_gate_output * input_modulation_gate_output;
2194       FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
2195       FS prev_state_times_forget_state = forget_gate_output * prev_state;
2196       // Implementation of internal addition node, saturating.
2197       FS new_state = gemmlowp::SaturatingAdd(
2198           gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
2199           prev_state_times_forget_state);
2200       // Implementation of last internal Tanh node, still in fixed-point.
2201       // Since a Tanh fixed-point implementation is specialized for a given
2202       // number or integer bits, and each specialization can have a substantial
2203       // code size, and we already used above a Tanh on an input with 3 integer
2204       // bits, and per the table in the above function comment there is no
2205       // significant accuracy to be lost by clamping to [-8, +8] for a
2206       // 3-integer-bits representation, let us just do that. This helps people
2207       // porting this to targets where code footprint must be minimized.
2208       F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
2209       F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
2210       // Store the new internal state back to memory, as 16-bit integers.
2211       // Note: here we store the original value with StateIntegerBits, not
2212       // the rescaled 3-integer-bits value fed to tanh.
2213       output_state_data_int16[b * output_depth + c] = new_state.raw();
2214       // Down-scale the output activations to 8-bit integers, saturating,
2215       // and store back to memory.
2216       int16 rescaled_output_activ =
2217           gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
2218       int16 clamped_output_activ =
2219           std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
2220       output_activ_data_uint8[b * output_depth + c] =
2221           128 + clamped_output_activ;
2222     }
2223   }
2224 }
2225 
2226 template <typename Scalar>
Split(const SplitParams & params,const RuntimeShape & input_shape,const Scalar * input_data,const RuntimeShape * const * output_shapes,Scalar * const * output_data)2227 void Split(const SplitParams& params, const RuntimeShape& input_shape,
2228            const Scalar* input_data, const RuntimeShape* const* output_shapes,
2229            Scalar* const* output_data) {
2230   gemmlowp::ScopedProfilingLabel label("Split");
2231   const int concat_dimensions = input_shape.DimensionsCount();
2232   int axis = params.axis < 0 ? params.axis + concat_dimensions : params.axis;
2233   int outputs_count = params.num_split;
2234   TFLITE_DCHECK_LT(axis, concat_dimensions);
2235 
2236   int64_t concat_size = 0;
2237   for (int i = 0; i < outputs_count; i++) {
2238     TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), concat_dimensions);
2239     for (int j = 0; j < concat_dimensions; j++) {
2240       if (j != axis) {
2241         MatchingDim(*output_shapes[i], j, input_shape, j);
2242       }
2243     }
2244     concat_size += output_shapes[i]->Dims(axis);
2245   }
2246   TFLITE_DCHECK_EQ(concat_size, input_shape.Dims(axis));
2247   int64_t outer_size = 1;
2248   for (int i = 0; i < axis; ++i) {
2249     outer_size *= input_shape.Dims(i);
2250   }
2251   // For all output arrays,
2252   // FlatSize() = outer_size * Dims(axis) * base_inner_size;
2253   int64_t base_inner_size = 1;
2254   for (int i = axis + 1; i < concat_dimensions; ++i) {
2255     base_inner_size *= input_shape.Dims(i);
2256   }
2257 
2258   const Scalar* input_ptr = input_data;
2259   for (int k = 0; k < outer_size; k++) {
2260     for (int i = 0; i < outputs_count; ++i) {
2261       const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
2262       memcpy(output_data[i] + k * copy_size, input_ptr,
2263              copy_size * sizeof(Scalar));
2264       input_ptr += copy_size;
2265     }
2266   }
2267 }
2268 
NodeOffset(int b,int h,int w,int height,int width)2269 inline int NodeOffset(int b, int h, int w, int height, int width) {
2270   return (b * height + h) * width + w;
2271 }
2272 
AveragePool(const PoolParams & params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2273 inline void AveragePool(const PoolParams& params,
2274                         const RuntimeShape& input_shape,
2275                         const float* input_data,
2276                         const RuntimeShape& output_shape, float* output_data) {
2277   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2278   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2279   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2280   const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2281   const int input_height = input_shape.Dims(1);
2282   const int input_width = input_shape.Dims(2);
2283   const int output_height = output_shape.Dims(1);
2284   const int output_width = output_shape.Dims(2);
2285   const int stride_height = params.stride_height;
2286   const int stride_width = params.stride_width;
2287   for (int batch = 0; batch < batches; ++batch) {
2288     for (int out_y = 0; out_y < output_height; ++out_y) {
2289       for (int out_x = 0; out_x < output_width; ++out_x) {
2290         for (int channel = 0; channel < depth; ++channel) {
2291           const int in_x_origin =
2292               (out_x * stride_width) - params.padding_values.width;
2293           const int in_y_origin =
2294               (out_y * stride_height) - params.padding_values.height;
2295           // Compute the boundaries of the filter region clamped so as to
2296           // ensure that the filter window fits in the input array.
2297           const int filter_x_start = std::max(0, -in_x_origin);
2298           const int filter_x_end =
2299               std::min(params.filter_width, input_width - in_x_origin);
2300           const int filter_y_start = std::max(0, -in_y_origin);
2301           const int filter_y_end =
2302               std::min(params.filter_height, input_height - in_y_origin);
2303           float total = 0.f;
2304           float filter_count = 0;
2305           for (int filter_y = filter_y_start; filter_y < filter_y_end;
2306                ++filter_y) {
2307             for (int filter_x = filter_x_start; filter_x < filter_x_end;
2308                  ++filter_x) {
2309               const int in_x = in_x_origin + filter_x;
2310               const int in_y = in_y_origin + filter_y;
2311               total +=
2312                   input_data[Offset(input_shape, batch, in_y, in_x, channel)];
2313               filter_count++;
2314             }
2315           }
2316           const float average = total / filter_count;
2317           output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2318               ActivationFunctionWithMinMax(average, params.float_activation_min,
2319                                            params.float_activation_max);
2320         }
2321       }
2322     }
2323   }
2324 }
2325 
AveragePool(const PoolParams & params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & output_shape,uint8 * output_data)2326 inline void AveragePool(const PoolParams& params,
2327                         const RuntimeShape& input_shape,
2328                         const uint8* input_data,
2329                         const RuntimeShape& output_shape, uint8* output_data) {
2330   TFLITE_DCHECK_LE(params.quantized_activation_min,
2331                    params.quantized_activation_max);
2332   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2333   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2334   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2335   const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2336   const int input_height = input_shape.Dims(1);
2337   const int input_width = input_shape.Dims(2);
2338   const int output_height = output_shape.Dims(1);
2339   const int output_width = output_shape.Dims(2);
2340   const int stride_height = params.stride_height;
2341   const int stride_width = params.stride_width;
2342   for (int batch = 0; batch < batches; ++batch) {
2343     for (int out_y = 0; out_y < output_height; ++out_y) {
2344       for (int out_x = 0; out_x < output_width; ++out_x) {
2345         for (int channel = 0; channel < depth; ++channel) {
2346           const int in_x_origin =
2347               (out_x * stride_width) - params.padding_values.width;
2348           const int in_y_origin =
2349               (out_y * stride_height) - params.padding_values.height;
2350           // Compute the boundaries of the filter region clamped so as to
2351           // ensure that the filter window fits in the input array.
2352           const int filter_x_start = std::max(0, -in_x_origin);
2353           const int filter_x_end =
2354               std::min(params.filter_width, input_width - in_x_origin);
2355           const int filter_y_start = std::max(0, -in_y_origin);
2356           const int filter_y_end =
2357               std::min(params.filter_height, input_height - in_y_origin);
2358           int32 acc = 0;
2359           int filter_count = 0;
2360           for (int filter_y = filter_y_start; filter_y < filter_y_end;
2361                ++filter_y) {
2362             for (int filter_x = filter_x_start; filter_x < filter_x_end;
2363                  ++filter_x) {
2364               const int in_x = in_x_origin + filter_x;
2365               const int in_y = in_y_origin + filter_y;
2366               acc +=
2367                   input_data[Offset(input_shape, batch, in_y, in_x, channel)];
2368               filter_count++;
2369             }
2370           }
2371           acc = (acc + filter_count / 2) / filter_count;
2372           acc = std::max(acc, params.quantized_activation_min);
2373           acc = std::min(acc, params.quantized_activation_max);
2374           output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2375               static_cast<uint8>(acc);
2376         }
2377       }
2378     }
2379   }
2380 }
2381 
L2Pool(const PoolParams & params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2382 inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
2383                    const float* input_data, const RuntimeShape& output_shape,
2384                    float* output_data) {
2385   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2386   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2387   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2388   const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2389   const int input_height = input_shape.Dims(1);
2390   const int input_width = input_shape.Dims(2);
2391   const int output_height = output_shape.Dims(1);
2392   const int output_width = output_shape.Dims(2);
2393   const int stride_height = params.stride_height;
2394   const int stride_width = params.stride_width;
2395   for (int batch = 0; batch < batches; ++batch) {
2396     for (int out_y = 0; out_y < output_height; ++out_y) {
2397       for (int out_x = 0; out_x < output_width; ++out_x) {
2398         for (int channel = 0; channel < depth; ++channel) {
2399           const int in_x_origin =
2400               (out_x * stride_width) - params.padding_values.width;
2401           const int in_y_origin =
2402               (out_y * stride_height) - params.padding_values.height;
2403           // Compute the boundaries of the filter region clamped so as to
2404           // ensure that the filter window fits in the input array.
2405           const int filter_x_start = std::max(0, -in_x_origin);
2406           const int filter_x_end =
2407               std::min(params.filter_width, input_width - in_x_origin);
2408           const int filter_y_start = std::max(0, -in_y_origin);
2409           const int filter_y_end =
2410               std::min(params.filter_height, input_height - in_y_origin);
2411           float sum_squares = 0.f;
2412           int filter_count = 0;
2413           for (int filter_y = filter_y_start; filter_y < filter_y_end;
2414                ++filter_y) {
2415             for (int filter_x = filter_x_start; filter_x < filter_x_end;
2416                  ++filter_x) {
2417               const int in_x = in_x_origin + filter_x;
2418               const int in_y = in_y_origin + filter_y;
2419               const float val =
2420                   input_data[Offset(input_shape, batch, in_y, in_x, channel)];
2421               sum_squares += val * val;
2422               filter_count++;
2423             }
2424           }
2425           const float l2pool_result = std::sqrt(sum_squares / filter_count);
2426           output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2427               ActivationFunctionWithMinMax(l2pool_result,
2428                                            params.float_activation_min,
2429                                            params.float_activation_max);
2430         }
2431       }
2432     }
2433   }
2434 }
2435 
MaxPool(const PoolParams & params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2436 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
2437                     const float* input_data, const RuntimeShape& output_shape,
2438                     float* output_data) {
2439   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2440   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2441   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2442   const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2443   const int input_height = input_shape.Dims(1);
2444   const int input_width = input_shape.Dims(2);
2445   const int output_height = output_shape.Dims(1);
2446   const int output_width = output_shape.Dims(2);
2447   const int stride_height = params.stride_height;
2448   const int stride_width = params.stride_width;
2449   for (int batch = 0; batch < batches; ++batch) {
2450     for (int out_y = 0; out_y < output_height; ++out_y) {
2451       for (int out_x = 0; out_x < output_width; ++out_x) {
2452         for (int channel = 0; channel < depth; ++channel) {
2453           const int in_x_origin =
2454               (out_x * stride_width) - params.padding_values.width;
2455           const int in_y_origin =
2456               (out_y * stride_height) - params.padding_values.height;
2457           // Compute the boundaries of the filter region clamped so as to
2458           // ensure that the filter window fits in the input array.
2459           const int filter_x_start = std::max(0, -in_x_origin);
2460           const int filter_x_end =
2461               std::min(params.filter_width, input_width - in_x_origin);
2462           const int filter_y_start = std::max(0, -in_y_origin);
2463           const int filter_y_end =
2464               std::min(params.filter_height, input_height - in_y_origin);
2465           float max = std::numeric_limits<float>::lowest();
2466           for (int filter_y = filter_y_start; filter_y < filter_y_end;
2467                ++filter_y) {
2468             for (int filter_x = filter_x_start; filter_x < filter_x_end;
2469                  ++filter_x) {
2470               const int in_x = in_x_origin + filter_x;
2471               const int in_y = in_y_origin + filter_y;
2472               max = std::max(
2473                   max,
2474                   input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
2475             }
2476           }
2477           output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2478               ActivationFunctionWithMinMax(max, params.float_activation_min,
2479                                            params.float_activation_max);
2480         }
2481       }
2482     }
2483   }
2484 }
2485 
MaxPool(const PoolParams & params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & output_shape,uint8 * output_data)2486 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
2487                     const uint8* input_data, const RuntimeShape& output_shape,
2488                     uint8* output_data) {
2489   TFLITE_DCHECK_LE(params.quantized_activation_min,
2490                    params.quantized_activation_max);
2491   TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
2492   TFLITE_DCHECK_LE(params.quantized_activation_max, 255);
2493   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2494   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2495   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2496   const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2497   const int input_height = input_shape.Dims(1);
2498   const int input_width = input_shape.Dims(2);
2499   const int output_height = output_shape.Dims(1);
2500   const int output_width = output_shape.Dims(2);
2501   const int stride_height = params.stride_height;
2502   const int stride_width = params.stride_width;
2503   for (int batch = 0; batch < batches; ++batch) {
2504     for (int out_y = 0; out_y < output_height; ++out_y) {
2505       for (int out_x = 0; out_x < output_width; ++out_x) {
2506         for (int channel = 0; channel < depth; ++channel) {
2507           const int in_x_origin =
2508               (out_x * stride_width) - params.padding_values.width;
2509           const int in_y_origin =
2510               (out_y * stride_height) - params.padding_values.height;
2511           // Compute the boundaries of the filter region clamped so as to
2512           // ensure that the filter window fits in the input array.
2513           const int filter_x_start = std::max(0, -in_x_origin);
2514           const int filter_x_end =
2515               std::min(params.filter_width, input_width - in_x_origin);
2516           const int filter_y_start = std::max(0, -in_y_origin);
2517           const int filter_y_end =
2518               std::min(params.filter_height, input_height - in_y_origin);
2519           uint8 max = 0;
2520           for (int filter_y = filter_y_start; filter_y < filter_y_end;
2521                ++filter_y) {
2522             for (int filter_x = filter_x_start; filter_x < filter_x_end;
2523                  ++filter_x) {
2524               const int in_x = in_x_origin + filter_x;
2525               const int in_y = in_y_origin + filter_y;
2526               max = std::max(
2527                   max,
2528                   input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
2529             }
2530           }
2531           max = std::max<uint8>(max, params.quantized_activation_min);
2532           max = std::min<uint8>(max, params.quantized_activation_max);
2533           output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2534               static_cast<uint8>(max);
2535         }
2536       }
2537     }
2538   }
2539 }
2540 
LocalResponseNormalization(const tflite::LocalResponseNormalizationParams & op_params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2541 inline void LocalResponseNormalization(
2542     const tflite::LocalResponseNormalizationParams& op_params,
2543     const RuntimeShape& input_shape, const float* input_data,
2544     const RuntimeShape& output_shape, float* output_data) {
2545   const int trailing_dim = input_shape.DimensionsCount() - 1;
2546   const int outer_size =
2547       MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
2548   const int depth =
2549       MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
2550 
2551   for (int i = 0; i < outer_size; ++i) {
2552     for (int c = 0; c < depth; ++c) {
2553       const int begin_input_c = std::max(0, c - op_params.range);
2554       const int end_input_c = std::min(depth, c + op_params.range);
2555       float accum = 0.f;
2556       for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
2557         const float input_val = input_data[i * depth + input_c];
2558         accum += input_val * input_val;
2559       }
2560       const float multiplier =
2561           std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta);
2562       output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
2563     }
2564   }
2565 }
2566 
LogSoftmax(const SoftmaxParams & params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2567 inline void LogSoftmax(const SoftmaxParams& params,
2568                        const RuntimeShape& input_shape, const float* input_data,
2569                        const RuntimeShape& output_shape, float* output_data) {
2570   const int trailing_dim = input_shape.DimensionsCount() - 1;
2571   const int outer_size =
2572       MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
2573   const int depth =
2574       MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
2575 
2576   for (int i = 0; i < outer_size; ++i) {
2577     // Find max element value which we'll use to ensure numerical stability
2578     // taking advantage of the following equality:
2579     // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
2580     float max = std::numeric_limits<float>::lowest();
2581     for (int c = 0; c < depth; ++c) {
2582       max = std::max(max, input_data[i * depth + c]);
2583     }
2584 
2585     // Compute sum.
2586     float sum = 0.f;
2587     for (int c = 0; c < depth; ++c) {
2588       sum += std::exp(input_data[i * depth + c] - max);
2589     }
2590 
2591     // Compute result.
2592     const float log_sum = std::log(sum);
2593     for (int c = 0; c < depth; ++c) {
2594       output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
2595     }
2596   }
2597 }
2598 
LogSoftmax(const SoftmaxParams & params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & output_shape,uint8 * output_data)2599 inline void LogSoftmax(const SoftmaxParams& params,
2600                        const RuntimeShape& input_shape, const uint8* input_data,
2601                        const RuntimeShape& output_shape, uint8* output_data) {
2602   gemmlowp::ScopedProfilingLabel label("LogSoftmax/8bit");
2603   const int32 input_multiplier = params.input_multiplier;
2604   const int32 input_left_shift = params.input_left_shift;
2605   const int32 reverse_scaling_divisor = params.reverse_scaling_divisor;
2606   const int32 reverse_scaling_right_shift = params.reverse_scaling_right_shift;
2607   const int diff_min = params.diff_min;
2608   // The representation chosen for the input to the exp() function is Q5.26.
2609   // We need to leave extra space since values that we skip might be as large
2610   // as -32 before multiplying by input_beta_multiplier, and therefore as
2611   // large as -16 afterwards.  Note that exp(-8) is definitely not
2612   // insignificant to accumulation, but exp(-16) definitely is.
2613   static constexpr int kScaledDiffIntegerBits = 5;
2614   static constexpr int kAccumulationIntegerBits = 12;
2615   static constexpr int kOutputIntegerBits = 4;
2616   using FixedPointScaledDiff =
2617       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
2618   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
2619 
2620   const int trailing_dim = input_shape.DimensionsCount() - 1;
2621   const int outer_size =
2622       MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
2623   const int depth =
2624       MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
2625 
2626   for (int i = 0; i < outer_size; ++i) {
2627     uint8 max_in_row = 0;
2628     for (int c = 0; c < depth; ++c) {
2629       max_in_row = std::max(max_in_row, input_data[i * depth + c]);
2630     }
2631 
2632     FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
2633     for (int c = 0; c < depth; ++c) {
2634       int32 input_diff =
2635           static_cast<int32>(input_data[i * depth + c]) - max_in_row;
2636       if (input_diff >= diff_min) {
2637         const int32 input_diff_rescaled =
2638             MultiplyByQuantizedMultiplierGreaterThanOne(
2639                 input_diff, input_multiplier, input_left_shift);
2640         const FixedPointScaledDiff scaled_diff_f8 =
2641             FixedPointScaledDiff::FromRaw(input_diff_rescaled);
2642         sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
2643                                         exp_on_negative_values(scaled_diff_f8));
2644       }
2645     }
2646 
2647     const int32 fixed_log_sum_of_exps =
2648         log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
2649             sum_of_exps)
2650             .raw();
2651 
2652     // rescaled_diff_min is smallest representable in
2653     // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
2654     // log-sub-exps that will be subtracted in the loop.
2655     //
2656     // The thresholds diff_min, etc are negative.
2657     const int rescaled_diff_min =
2658         fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
2659     const int adjusted_diff_min =
2660         std::max(diff_min - 1,  // Note use of > below instead of >= above.
2661                  MultiplyByQuantizedMultiplierSmallerThanOneExp(
2662                      rescaled_diff_min, reverse_scaling_divisor,
2663                      -reverse_scaling_right_shift));
2664 
2665     for (int c = 0; c < depth; ++c) {
2666       int32 input_diff =
2667           static_cast<int32>(input_data[i * depth + c]) - max_in_row;
2668       if (input_diff > adjusted_diff_min) {
2669         const int32 input_diff_rescaled =
2670             MultiplyByQuantizedMultiplierGreaterThanOne(
2671                 input_diff, input_multiplier, input_left_shift);
2672         int32 unsat_output =
2673             gemmlowp::RoundingDivideByPOT(
2674                 (input_diff_rescaled - fixed_log_sum_of_exps),
2675                 31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
2676             255;
2677 
2678         output_data[i * depth + c] = static_cast<uint8>(
2679             std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
2680       } else {
2681         // Set output to smallest value.
2682         output_data[i * depth + c] = 0;
2683       }
2684     }
2685   }
2686 }
2687 
Logistic(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2688 inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
2689                      const RuntimeShape& output_shape, float* output_data) {
2690   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2691 
2692   for (int i = 0; i < flat_size; i++) {
2693     float val = input_data[i];
2694     float result = 1.f / (1.f + std::exp(-val));
2695     output_data[i] = result;
2696   }
2697 }
2698 
2699 // Convenience version that allows, for example, generated-code calls to be
2700 // uniform between data types.
Logistic(const LogisticParams &,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2701 inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
2702                      const float* input_data, const RuntimeShape& output_shape,
2703                      float* output_data) {
2704   // Drop params: not needed.
2705   Logistic(input_shape, input_data, output_shape, output_data);
2706 }
2707 
Logistic(const LogisticParams & params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & output_shape,uint8 * output_data)2708 inline void Logistic(const LogisticParams& params,
2709                      const RuntimeShape& input_shape, const uint8* input_data,
2710                      const RuntimeShape& output_shape, uint8* output_data) {
2711   const int32 input_zero_point = params.input_zero_point;
2712   const int32 input_range_radius = params.input_range_radius;
2713   const int32 input_multiplier = params.input_multiplier;
2714   const int input_left_shift = params.input_left_shift;
2715   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2716 
2717   for (int i = 0; i < flat_size; i++) {
2718     const uint8 input_val_u8 = input_data[i];
2719     const int32 input_val_centered =
2720         static_cast<int32>(input_val_u8) - input_zero_point;
2721     uint8 output_val;
2722     if (input_val_centered <= -input_range_radius) {
2723       output_val = 0;
2724     } else if (input_val_centered >= input_range_radius) {
2725       output_val = 255;
2726     } else {
2727       const int32 input_val_rescaled =
2728           MultiplyByQuantizedMultiplierGreaterThanOne(
2729               input_val_centered, input_multiplier, input_left_shift);
2730       using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
2731       using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
2732       const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
2733       const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
2734       // Convert from Q0.31 to Q23.8.
2735       using gemmlowp::RoundingDivideByPOT;
2736       int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
2737       if (output_val_s32 == 256) {
2738         output_val_s32 = 255;
2739       }
2740       // Reinterpret as U0.8.
2741       TFLITE_DCHECK_GE(output_val_s32, 0);
2742       TFLITE_DCHECK_LE(output_val_s32, 255);
2743       output_val = static_cast<uint8>(output_val_s32);
2744     }
2745     output_data[i] = output_val;
2746   }
2747 }
2748 
Logistic(const LogisticParams & params,const RuntimeShape & input_shape,const int16 * input_data,const RuntimeShape & output_shape,int16 * output_data)2749 inline void Logistic(const LogisticParams& params,
2750                      const RuntimeShape& input_shape, const int16* input_data,
2751                      const RuntimeShape& output_shape, int16* output_data) {
2752   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2753 
2754   for (int i = 0; i < flat_size; i++) {
2755     // F0 uses 0 integer bits, range [-1, 1].
2756     // This is the return type of math functions such as tanh, logistic,
2757     // whose range is in [-1, 1].
2758     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
2759     // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
2760     using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
2761 
2762     const F3 input = F3::FromRaw(input_data[i]);
2763     F0 output = gemmlowp::logistic(input);
2764     output_data[i] = output.raw();
2765   }
2766 }
2767 
Tanh(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2768 inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
2769                  const RuntimeShape& output_shape, float* output_data) {
2770   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2771 
2772   for (int i = 0; i < flat_size; i++) {
2773     float val = input_data[i];
2774     float result = std::tanh(val);
2775     output_data[i] = result;
2776   }
2777 }
2778 
2779 // Convenience version that allows, for example, generated-code calls to be
2780 // uniform between data types.
Tanh(const TanhParams &,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2781 inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
2782                  const float* input_data, const RuntimeShape& output_shape,
2783                  float* output_data) {
2784   // Drop params: not needed.
2785   Tanh(input_shape, input_data, output_shape, output_data);
2786 }
2787 
Tanh(const TanhParams & params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & output_shape,uint8 * output_data)2788 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
2789                  const uint8* input_data, const RuntimeShape& output_shape,
2790                  uint8* output_data) {
2791   const int32 input_zero_point = params.input_zero_point;
2792   const int32 input_range_radius = params.input_range_radius;
2793   const int32 input_multiplier = params.input_multiplier;
2794   const int input_left_shift = params.input_left_shift;
2795   const int32 output_zero_point = 128;
2796   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2797 
2798   for (int i = 0; i < flat_size; i++) {
2799     const uint8 input_val_u8 = input_data[i];
2800     const int32 input_val_centered =
2801         static_cast<int32>(input_val_u8) - input_zero_point;
2802     uint8 output_val;
2803     if (input_val_centered <= -input_range_radius) {
2804       output_val = 0;
2805     } else if (input_val_centered >= input_range_radius) {
2806       output_val = 255;
2807     } else {
2808       const int32 input_val_rescaled =
2809           MultiplyByQuantizedMultiplierGreaterThanOne(
2810               input_val_centered, input_multiplier, input_left_shift);
2811       using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
2812       using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
2813       const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
2814       const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
2815       // Convert from Q0.31 to Q24.7.
2816       using gemmlowp::RoundingDivideByPOT;
2817       int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
2818       output_val_s32 += output_zero_point;
2819       if (output_val_s32 == 256) {
2820         output_val_s32 = 255;
2821       }
2822       // Reinterpret as Q0.7, encoded in uint8.
2823       TFLITE_DCHECK_GE(output_val_s32, 0);
2824       TFLITE_DCHECK_LE(output_val_s32, 255);
2825       output_val = static_cast<uint8>(output_val_s32);
2826     }
2827     output_data[i] = output_val;
2828   }
2829 }
2830 
Tanh(const TanhParams & params,const RuntimeShape & input_shape,const int16 * input_data,const RuntimeShape & output_shape,int16 * output_data)2831 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
2832                  const int16* input_data, const RuntimeShape& output_shape,
2833                  int16* output_data) {
2834   const int input_left_shift = params.input_left_shift;
2835   // Support for shifts is limited until we have a parameterized version of
2836   // SaturatingRoundingMultiplyByPOT().
2837   TFLITE_DCHECK_GE(input_left_shift, 0);
2838   TFLITE_DCHECK_LE(input_left_shift, 1);
2839 
2840   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2841 
2842   // F0 uses 0 integer bits, range [-1, 1].
2843   // This is the return type of math functions such as tanh, logistic,
2844   // whose range is in [-1, 1].
2845   using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
2846   // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
2847   using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
2848 
2849   if (input_left_shift == 0) {
2850     for (int i = 0; i < flat_size; i++) {
2851       F3 input = F3::FromRaw(input_data[i]);
2852       F0 output = gemmlowp::tanh(input);
2853       output_data[i] = output.raw();
2854     }
2855   } else {
2856     for (int i = 0; i < flat_size; i++) {
2857       F3 input = F3::FromRaw(
2858           gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
2859       F0 output = gemmlowp::tanh(input);
2860       output_data[i] = output.raw();
2861     }
2862   }
2863 }
2864 
Dequantize(const tflite::DequantizationParams & op_params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & output_shape,float * output_data)2865 inline void Dequantize(const tflite::DequantizationParams& op_params,
2866                        const RuntimeShape& input_shape, const uint8* input_data,
2867                        const RuntimeShape& output_shape, float* output_data) {
2868   gemmlowp::ScopedProfilingLabel label("Dequantize");
2869   int32 zero_point = op_params.zero_point;
2870   double scale = op_params.scale;
2871   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2872 
2873   for (int i = 0; i < flat_size; i++) {
2874     int32 val = input_data[i];
2875     float result = static_cast<float>(scale * (val - zero_point));
2876     output_data[i] = result;
2877   }
2878 }
2879 
FakeQuant(const tflite::FakeQuantParams & op_params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2880 inline void FakeQuant(const tflite::FakeQuantParams& op_params,
2881                       const RuntimeShape& input_shape, const float* input_data,
2882                       const RuntimeShape& output_shape, float* output_data) {
2883   gemmlowp::ScopedProfilingLabel label("FakeQuant");
2884   float rmin = op_params.minmax.min;
2885   float rmax = op_params.minmax.max;
2886   int num_bits = op_params.num_bits;
2887   // 0 should always be a representable value. Let's assume that the initial
2888   // min,max range contains 0.
2889   TFLITE_DCHECK_LE(rmin, 0.0f);
2890   TFLITE_DCHECK_GE(rmax, 0.0f);
2891   TFLITE_DCHECK_LT(rmin, rmax);
2892 
2893   // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
2894   int quant_min = 0;
2895   int quant_max = (1 << num_bits) - 1;
2896   float nudged_min, nudged_max, nudged_scale;
2897   NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
2898                          &nudged_max, &nudged_scale);
2899   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2900   FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data,
2901                     output_data, flat_size);
2902 }
2903 
2904 template <typename SrcT, typename DstT>
Cast(const RuntimeShape & input_shape,const SrcT * input_data,const RuntimeShape & output_shape,DstT * output_data)2905 inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data,
2906                  const RuntimeShape& output_shape, DstT* output_data) {
2907   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2908 
2909   for (int i = 0; i < flat_size; i++) {
2910     int offset = i;
2911     output_data[offset] = static_cast<DstT>(input_data[offset]);
2912   }
2913 }
2914 
Floor(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2915 inline void Floor(const RuntimeShape& input_shape, const float* input_data,
2916                   const RuntimeShape& output_shape, float* output_data) {
2917   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2918 
2919   for (int i = 0; i < flat_size; i++) {
2920     int offset = i;
2921     output_data[offset] = std::floor(input_data[offset]);
2922   }
2923 }
2924 
Ceil(const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & output_shape,float * output_data)2925 inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
2926                  const RuntimeShape& output_shape, float* output_data) {
2927   const int flat_size = MatchingFlatSize(input_shape, output_shape);
2928 
2929   for (int i = 0; i < flat_size; i++) {
2930     int offset = i;
2931     output_data[offset] = std::ceil(input_data[offset]);
2932   }
2933 }
2934 
2935 template <typename T, typename CoordsT = int32>
Gather(const tflite::GatherParams & op_params,const RuntimeShape & input_shape,const T * input_data,const RuntimeShape & coords_shape,const CoordsT * coords_data,const RuntimeShape & output_shape,T * output_data)2936 inline void Gather(const tflite::GatherParams& op_params,
2937                    const RuntimeShape& input_shape, const T* input_data,
2938                    const RuntimeShape& coords_shape, const CoordsT* coords_data,
2939                    const RuntimeShape& output_shape, T* output_data) {
2940   gemmlowp::ScopedProfilingLabel label("Gather");
2941   int axis = op_params.axis;
2942   if (axis < 0) {
2943     axis += input_shape.DimensionsCount();
2944   }
2945   TFLITE_DCHECK_GE(axis, 0);
2946   TFLITE_DCHECK_LT(axis, input_shape.DimensionsCount());
2947   const int axis_size = input_shape.Dims(axis);
2948   const int coords_count = coords_shape.FlatSize();
2949 
2950   int outer_size = 1;
2951   for (int i = 0; i < axis; ++i) {
2952     outer_size *= input_shape.Dims(i);
2953   }
2954 
2955   int inner_size = 1;
2956   for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
2957     inner_size *= input_shape.Dims(i);
2958   }
2959 
2960   for (int outer = 0; outer < outer_size; ++outer) {
2961     for (int i = 0; i < coords_count; ++i) {
2962       TFLITE_DCHECK_GE(coords_data[i], 0);
2963       TFLITE_DCHECK_LT(coords_data[i], axis_size);
2964       std::memcpy(
2965           output_data + (outer * coords_count + i) * inner_size,
2966           input_data + (outer * axis_size + coords_data[i]) * inner_size,
2967           sizeof(T) * inner_size);
2968     }
2969   }
2970 }
2971 
2972 template <typename ParamsT, typename IndicesT = int32>
GatherNd(const RuntimeShape & params_shape,const ParamsT * params_data,const RuntimeShape & indices_shape,const IndicesT * indices_data,const RuntimeShape & output_shape,ParamsT * output_data)2973 inline void GatherNd(const RuntimeShape& params_shape,
2974                      const ParamsT* params_data,
2975                      const RuntimeShape& indices_shape,
2976                      const IndicesT* indices_data,
2977                      const RuntimeShape& output_shape, ParamsT* output_data) {
2978   gemmlowp::ScopedProfilingLabel label("GatherNd");
2979 
2980   int n_slices = 1;
2981   int slice_size = 1;
2982   const int indices_dims = indices_shape.DimensionsCount();
2983   const int indices_nd = indices_shape.Dims(indices_dims - 1);
2984   const int params_dims = params_shape.DimensionsCount();
2985   for (int i = 0; i < indices_dims - 1; ++i) {
2986     n_slices *= indices_shape.Dims(i);
2987   }
2988   for (int i = indices_nd; i < params_dims; ++i) {
2989     slice_size *= params_shape.Dims(i);
2990   }
2991 
2992   int remain_flat_size = params_shape.FlatSize();
2993   std::vector<int> dims_to_count(indices_nd, 0);
2994   for (int i = 0; i < indices_nd; ++i) {
2995     dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
2996     remain_flat_size = dims_to_count[i];
2997   }
2998 
2999   for (int i = 0; i < n_slices; ++i) {
3000     int from_pos = 0;
3001     for (int j = 0; j < indices_nd; ++j) {
3002       from_pos += indices_data[i * indices_nd + j] * dims_to_count[j];
3003     }
3004     std::memcpy(output_data + i * slice_size, params_data + from_pos,
3005                 sizeof(ParamsT) * slice_size);
3006   }
3007 }
3008 
3009 template <typename T>
ResizeBilinear(const tflite::ResizeBilinearParams & op_params,const RuntimeShape & unextended_input_shape,const T * input_data,const RuntimeShape & unextended_output_size_shape,const int32 * output_size_data,const RuntimeShape & unextended_output_shape,T * output_data)3010 inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
3011                            const RuntimeShape& unextended_input_shape,
3012                            const T* input_data,
3013                            const RuntimeShape& unextended_output_size_shape,
3014                            const int32* output_size_data,
3015                            const RuntimeShape& unextended_output_shape,
3016                            T* output_data) {
3017   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
3018   TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
3019   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3020   const RuntimeShape input_shape =
3021       RuntimeShape::ExtendedShape(4, unextended_input_shape);
3022   const RuntimeShape output_size_shape =
3023       RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
3024   const RuntimeShape output_shape =
3025       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3026 
3027   int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
3028   int32 input_height = input_shape.Dims(1);
3029   int32 input_width = input_shape.Dims(2);
3030   int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
3031 
3032   TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
3033   TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
3034   TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
3035   TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
3036   int32 output_height = output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
3037   int32 output_width = output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
3038 
3039   float height_scale = static_cast<float>(input_height) / output_height;
3040   float width_scale = static_cast<float>(input_width) / output_width;
3041   if (op_params.align_corners && output_height > 1) {
3042     height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
3043   }
3044   if (op_params.align_corners && output_width > 1) {
3045     width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
3046   }
3047 
3048   for (int b = 0; b < batches; ++b) {
3049     for (int y = 0; y < output_height; ++y) {
3050       float input_y = y * height_scale;
3051       int32 y0 = static_cast<int32>(std::floor(input_y));
3052       int32 y1 = std::min(y0 + 1, input_height - 1);
3053       for (int x = 0; x < output_width; ++x) {
3054         float input_x = x * width_scale;
3055         int32 x0 = static_cast<int32>(std::floor(input_x));
3056         int32 x1 = std::min(x0 + 1, input_width - 1);
3057         for (int c = 0; c < depth; ++c) {
3058           T interpolation =
3059               static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
3060                                  (1 - (input_y - y0)) * (1 - (input_x - x0)) +
3061                              input_data[Offset(input_shape, b, y1, x0, c)] *
3062                                  (input_y - y0) * (1 - (input_x - x0)) +
3063                              input_data[Offset(input_shape, b, y0, x1, c)] *
3064                                  (1 - (input_y - y0)) * (input_x - x0) +
3065                              input_data[Offset(input_shape, b, y1, x1, c)] *
3066                                  (input_y - y0) * (input_x - x0));
3067           output_data[Offset(output_shape, b, y, x, c)] = interpolation;
3068         }
3069       }
3070     }
3071   }
3072 }
3073 
3074 template <typename T>
SpaceToBatchND(const SpaceToBatchParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const int32 * block_shape_data,const RuntimeShape & unextended_input3_shape,const int32 * paddings_data,const RuntimeShape & unextended_output_shape,T * output_data)3075 inline void SpaceToBatchND(
3076     const SpaceToBatchParams& params,
3077     const RuntimeShape& unextended_input1_shape, const T* input1_data,
3078     const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
3079     const RuntimeShape& unextended_input3_shape, const int32* paddings_data,
3080     const RuntimeShape& unextended_output_shape, T* output_data) {
3081   gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
3082   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
3083   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3084   const RuntimeShape input1_shape =
3085       RuntimeShape::ExtendedShape(4, unextended_input1_shape);
3086   const RuntimeShape output_shape =
3087       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3088 
3089   const int depth = input1_shape.Dims(3);
3090   const int input_width = input1_shape.Dims(2);
3091   const int input_height = input1_shape.Dims(1);
3092   const int input_batch_size = input1_shape.Dims(0);
3093 
3094   const int output_width = output_shape.Dims(2);
3095   const int output_height = output_shape.Dims(1);
3096   const int output_batch_size = output_shape.Dims(0);
3097 
3098   const int block_shape_height = block_shape_data[0];
3099   const int block_shape_width = block_shape_data[1];
3100   const int padding_top = paddings_data[0];
3101   const int padding_left = paddings_data[2];
3102 
3103   // For uint8 quantized, the correct padding "zero value" is the output offset.
3104   const int32_t pad_value = params.output_offset;
3105 
3106   for (int out_b = 0; out_b < output_batch_size; ++out_b) {
3107     int input_batch = out_b % input_batch_size;
3108     int shift_w = (out_b / input_batch_size) % block_shape_width;
3109     int shift_h = (out_b / input_batch_size) / block_shape_width;
3110     for (int out_h = 0; out_h < output_height; ++out_h) {
3111       for (int out_w = 0; out_w < output_width; ++out_w) {
3112         T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
3113         if (out_h * block_shape_height + shift_h < padding_top ||
3114             out_h * block_shape_height + shift_h >=
3115                 padding_top + input_height ||
3116             out_w * block_shape_width + shift_w < padding_left ||
3117             out_w * block_shape_width + shift_w >= padding_left + input_width) {
3118           // This may not execute correctly when pad_value != 0 and T != uint8.
3119           memset(out, pad_value, depth * sizeof(T));
3120         } else {
3121           const T* in =
3122               input1_data +
3123               Offset(input1_shape, input_batch,
3124                      (out_h * block_shape_height + shift_h) - padding_top,
3125                      (out_w * block_shape_width + shift_w) - padding_left, 0);
3126           memcpy(out, in, depth * sizeof(T));
3127         }
3128       }
3129     }
3130   }
3131 }
3132 
3133 template <typename T>
BatchToSpaceND(const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const int32 * block_shape_data,const RuntimeShape & unextended_input3_shape,const int32 * crops_data,const RuntimeShape & unextended_output_shape,T * output_data)3134 inline void BatchToSpaceND(
3135     const RuntimeShape& unextended_input1_shape, const T* input1_data,
3136     const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
3137     const RuntimeShape& unextended_input3_shape, const int32* crops_data,
3138     const RuntimeShape& unextended_output_shape, T* output_data) {
3139   gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
3140   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
3141   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3142   const RuntimeShape input1_shape =
3143       RuntimeShape::ExtendedShape(4, unextended_input1_shape);
3144   const RuntimeShape output_shape =
3145       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3146 
3147   const int output_width = output_shape.Dims(2);
3148   const int output_height = output_shape.Dims(1);
3149   const int output_batch_size = output_shape.Dims(0);
3150 
3151   const int depth = input1_shape.Dims(3);
3152   const int input_width = input1_shape.Dims(2);
3153   const int input_height = input1_shape.Dims(1);
3154   const int input_batch_size = input1_shape.Dims(0);
3155 
3156   const int block_shape_width = block_shape_data[1];
3157   const int block_shape_height = block_shape_data[0];
3158   const int crops_top = crops_data[0];
3159   const int crops_left = crops_data[2];
3160 
3161   for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
3162     const int out_batch = in_batch % output_batch_size;
3163     const int spatial_offset = in_batch / output_batch_size;
3164     for (int in_h = 0; in_h < input_height; ++in_h) {
3165       const int out_h = in_h * block_shape_height +
3166                         spatial_offset / block_shape_width - crops_top;
3167       if (out_h < 0 || out_h >= output_height) {
3168         continue;
3169       }
3170       for (int in_w = 0; in_w < input_width; ++in_w) {
3171         const int out_w = in_w * block_shape_width +
3172                           spatial_offset % block_shape_width - crops_left;
3173 
3174         if (out_w < 0 || out_w >= output_width) {
3175           continue;
3176         }
3177         T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
3178         const T* in =
3179             input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
3180         memcpy(out, in, depth * sizeof(T));
3181       }
3182     }
3183   }
3184 }
3185 
3186 // There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
3187 // scalar input that provides the padding value.  Therefore pad_value_ptr can be
3188 // equivalent to a simple input1_data.  For Pad, it should point to a zero
3189 // value.
3190 //
3191 // Note that two typenames are required, so that T=P=int32 is considered a
3192 // specialization distinct from P=int32.
3193 template <typename T, typename P>
PadImpl(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const T * input_data,const P * pad_value_ptr,const RuntimeShape & output_shape,T * output_data)3194 inline void PadImpl(const tflite::PadParams& op_params,
3195                     const RuntimeShape& input_shape, const T* input_data,
3196                     const P* pad_value_ptr, const RuntimeShape& output_shape,
3197                     T* output_data) {
3198   const RuntimeShape ext_input_shape =
3199       RuntimeShape::ExtendedShape(4, input_shape);
3200   const RuntimeShape ext_output_shape =
3201       RuntimeShape::ExtendedShape(4, output_shape);
3202   TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
3203   TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
3204 
3205   // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
3206   // we can pad them to 4 dims (yes, we are "padding the padding").
3207   std::vector<int> left_padding_copy(4, 0);
3208   for (int i = 0; i < op_params.left_padding_count; ++i) {
3209     left_padding_copy[i] = op_params.left_padding[i];
3210   }
3211   std::vector<int> right_padding_copy(4, 0);
3212   for (int i = 0; i < op_params.right_padding_count; ++i) {
3213     right_padding_copy[i] = op_params.right_padding[i];
3214   }
3215 
3216   const int output_batch = ext_output_shape.Dims(0);
3217   const int output_height = ext_output_shape.Dims(1);
3218   const int output_width = ext_output_shape.Dims(2);
3219   const int output_depth = ext_output_shape.Dims(3);
3220 
3221   const int left_b_padding = left_padding_copy[0];
3222   const int left_h_padding = left_padding_copy[1];
3223   const int left_w_padding = left_padding_copy[2];
3224   const int left_d_padding = left_padding_copy[3];
3225 
3226   const int right_b_padding = right_padding_copy[0];
3227   const int right_h_padding = right_padding_copy[1];
3228   const int right_w_padding = right_padding_copy[2];
3229   const int right_d_padding = right_padding_copy[3];
3230 
3231   const T pad_value = *pad_value_ptr;
3232 
3233   const T* in_ptr = input_data;
3234   T* out_ptr = output_data;
3235   for (int out_b = 0; out_b < output_batch; ++out_b) {
3236     for (int out_h = 0; out_h < output_height; ++out_h) {
3237       for (int out_w = 0; out_w < output_width; ++out_w) {
3238         for (int out_d = 0; out_d < output_depth; ++out_d) {
3239           if (out_b < left_b_padding ||
3240               out_b >= output_batch - right_b_padding ||
3241               out_h < left_h_padding ||
3242               out_h >= output_height - right_h_padding ||
3243               out_w < left_w_padding ||
3244               out_w >= output_width - right_w_padding ||
3245               out_d < left_d_padding ||
3246               out_d >= output_depth - right_d_padding) {
3247             *out_ptr++ = pad_value;
3248           } else {
3249             *out_ptr++ = *in_ptr++;
3250           }
3251         }
3252       }
3253     }
3254   }
3255 }
3256 
3257 template <typename T, typename P>
Pad(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const T * input_data,const P * pad_value_ptr,const RuntimeShape & output_shape,T * output_data)3258 inline void Pad(const tflite::PadParams& op_params,
3259                 const RuntimeShape& input_shape, const T* input_data,
3260                 const P* pad_value_ptr, const RuntimeShape& output_shape,
3261                 T* output_data) {
3262   PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3263           output_data);
3264 }
3265 
3266 // The second (pad-value) input can be int32 when, say, the first is uint8.
3267 template <typename T>
Pad(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const T * input_data,const int32 * pad_value_ptr,const RuntimeShape & output_shape,T * output_data)3268 inline void Pad(const tflite::PadParams& op_params,
3269                 const RuntimeShape& input_shape, const T* input_data,
3270                 const int32* pad_value_ptr, const RuntimeShape& output_shape,
3271                 T* output_data) {
3272   const T converted_pad_value = static_cast<T>(*pad_value_ptr);
3273   PadImpl(op_params, input_shape, input_data, &converted_pad_value,
3274           output_shape, output_data);
3275 }
3276 
3277 // This version avoids conflicting template matching.
3278 template <>
Pad(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const int32 * input_data,const int32 * pad_value_ptr,const RuntimeShape & output_shape,int32 * output_data)3279 inline void Pad(const tflite::PadParams& op_params,
3280                 const RuntimeShape& input_shape, const int32* input_data,
3281                 const int32* pad_value_ptr, const RuntimeShape& output_shape,
3282                 int32* output_data) {
3283   PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3284           output_data);
3285 }
3286 
3287 // One could make all PadImageStyle calls simply delegate the work to the
3288 // ordinary Pad.  However, it is better that the reference code asserts false in
3289 // similar cases.
3290 template <typename T, typename P>
PadImageStyle(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const T * input_data,const P * pad_value_ptr,const RuntimeShape & output_shape,T * output_data)3291 inline void PadImageStyle(const tflite::PadParams& op_params,
3292                           const RuntimeShape& input_shape, const T* input_data,
3293                           const P* pad_value_ptr,
3294                           const RuntimeShape& output_shape, T* output_data) {
3295   TFLITE_ASSERT_FALSE;
3296 }
3297 
3298 template <typename P>
PadImageStyle(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const uint8 * input_data,const P * pad_value_ptr,const RuntimeShape & output_shape,uint8 * output_data)3299 inline void PadImageStyle(const tflite::PadParams& op_params,
3300                           const RuntimeShape& input_shape,
3301                           const uint8* input_data, const P* pad_value_ptr,
3302                           const RuntimeShape& output_shape,
3303                           uint8* output_data) {
3304   Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3305       output_data);
3306 }
3307 
3308 template <typename P>
PadImageStyle(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const int8_t * input_data,const P * pad_value_ptr,const RuntimeShape & output_shape,int8_t * output_data)3309 inline void PadImageStyle(const tflite::PadParams& op_params,
3310                           const RuntimeShape& input_shape,
3311                           const int8_t* input_data, const P* pad_value_ptr,
3312                           const RuntimeShape& output_shape,
3313                           int8_t* output_data) {
3314   Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3315       output_data);
3316 }
3317 
3318 template <typename P>
PadImageStyle(const tflite::PadParams & op_params,const RuntimeShape & input_shape,const float * input_data,const P * pad_value_ptr,const RuntimeShape & output_shape,float * output_data)3319 inline void PadImageStyle(const tflite::PadParams& op_params,
3320                           const RuntimeShape& input_shape,
3321                           const float* input_data, const P* pad_value_ptr,
3322                           const RuntimeShape& output_shape,
3323                           float* output_data) {
3324   Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3325       output_data);
3326 }
3327 template <typename T>
StridedSlice(const tflite::StridedSliceParams & op_params,const RuntimeShape & unextended_input_shape,const T * input_data,const RuntimeShape & unextended_output_shape,T * output_data)3328 inline void StridedSlice(const tflite::StridedSliceParams& op_params,
3329                          const RuntimeShape& unextended_input_shape,
3330                          const T* input_data,
3331                          const RuntimeShape& unextended_output_shape,
3332                          T* output_data) {
3333   // Note that the output_shape is not used herein.
3334   tflite::StridedSliceParams params_copy = op_params;
3335 
3336   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
3337   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3338   const RuntimeShape input_shape =
3339       RuntimeShape::ExtendedShape(4, unextended_input_shape);
3340   const RuntimeShape output_shape =
3341       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3342 
3343   // Reverse and pad to 4 dimensions because that is what the runtime code
3344   // requires (ie. all shapes must be 4D and are given backwards).
3345   strided_slice::StridedSlicePadIndices(&params_copy, 4);
3346 
3347   const int start_b = strided_slice::StartForAxis(params_copy, input_shape, 0);
3348   const int stop_b =
3349       strided_slice::StopForAxis(params_copy, input_shape, 0, start_b);
3350   const int start_h = strided_slice::StartForAxis(params_copy, input_shape, 1);
3351   const int stop_h =
3352       strided_slice::StopForAxis(params_copy, input_shape, 1, start_h);
3353   const int start_w = strided_slice::StartForAxis(params_copy, input_shape, 2);
3354   const int stop_w =
3355       strided_slice::StopForAxis(params_copy, input_shape, 2, start_w);
3356   const int start_d = strided_slice::StartForAxis(params_copy, input_shape, 3);
3357   const int stop_d =
3358       strided_slice::StopForAxis(params_copy, input_shape, 3, start_d);
3359 
3360   T* out_ptr = output_data;
3361   for (int in_b = start_b;
3362        !strided_slice::LoopCondition(in_b, stop_b, params_copy.strides[0]);
3363        in_b += params_copy.strides[0]) {
3364     for (int in_h = start_h;
3365          !strided_slice::LoopCondition(in_h, stop_h, params_copy.strides[1]);
3366          in_h += params_copy.strides[1]) {
3367       for (int in_w = start_w;
3368            !strided_slice::LoopCondition(in_w, stop_w, params_copy.strides[2]);
3369            in_w += params_copy.strides[2]) {
3370         for (int in_d = start_d; !strided_slice::LoopCondition(
3371                  in_d, stop_d, params_copy.strides[3]);
3372              in_d += params_copy.strides[3]) {
3373           *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w, in_d)];
3374         }
3375       }
3376     }
3377   }
3378 }
3379 
3380 template <typename T>
Slice(const tflite::SliceParams & op_params,const RuntimeShape & input_shape,const T * input_data,const RuntimeShape & output_shape,T * output_data)3381 inline void Slice(const tflite::SliceParams& op_params,
3382                   const RuntimeShape& input_shape, const T* input_data,
3383                   const RuntimeShape& output_shape, T* output_data) {
3384   const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
3385   // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
3386   TFLITE_DCHECK_LE(op_params.begin_count, 4);
3387   TFLITE_DCHECK_LE(op_params.size_count, 4);
3388   const int begin_count = op_params.begin_count;
3389   const int size_count = op_params.size_count;
3390   // We front-pad the begin and size vectors.
3391   const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
3392   const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
3393                          ? ext_shape.Dims(0) - start_b
3394                          : start_b + op_params.size[0];
3395   const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
3396   const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
3397                          ? ext_shape.Dims(1) - start_h
3398                          : start_h + op_params.size[size_count - 3];
3399   const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
3400   const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
3401                          ? ext_shape.Dims(2) - start_w
3402                          : start_w + op_params.size[size_count - 2];
3403   const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
3404   const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
3405                          ? ext_shape.Dims(3) - start_d
3406                          : start_d + op_params.size[size_count - 1];
3407 
3408   T* out_ptr = output_data;
3409   for (int in_b = start_b; in_b < stop_b; ++in_b) {
3410     for (int in_h = start_h; in_h < stop_h; ++in_h) {
3411       for (int in_w = start_w; in_w < stop_w; ++in_w) {
3412         for (int in_d = start_d; in_d < stop_d; ++in_d) {
3413           *out_ptr++ = input_data[Offset(ext_shape, in_b, in_h, in_w, in_d)];
3414         }
3415       }
3416     }
3417   }
3418 }
3419 
3420 template <typename T>
Exp(const T * input_data,const size_t num_elements,T * output_data)3421 inline void Exp(const T* input_data, const size_t num_elements,
3422                 T* output_data) {
3423   gemmlowp::ScopedProfilingLabel label("Exp");
3424   for (size_t idx = 0; idx < num_elements; ++idx) {
3425     output_data[idx] = exp(input_data[idx]);
3426   }
3427 }
3428 
3429 // A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
3430 // This method iterates through input data and reduce elements along the
3431 // dimensions given in axis.
3432 template <typename In, typename Out>
Reduce(const In * input_data,const int * input_dims,const int * output_dims,const int input_num_dims,const int output_num_dims,const int * axis,const int num_axis,int * input_iter,Out reducer (const Out current,const In in),Out * output_data)3433 inline bool Reduce(const In* input_data, const int* input_dims,
3434                    const int* output_dims, const int input_num_dims,
3435                    const int output_num_dims, const int* axis,
3436                    const int num_axis, int* input_iter,
3437                    Out reducer(const Out current, const In in),
3438                    Out* output_data) {
3439   // Reset input iterator.
3440   for (int idx = 0; idx < input_num_dims; ++idx) {
3441     input_iter[idx] = 0;
3442   }
3443   // Iterate through input_data.
3444   do {
3445     size_t input_offset =
3446         ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
3447     size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
3448                                                input_iter, num_axis, axis);
3449     output_data[output_offset] =
3450         reducer(output_data[output_offset], input_data[input_offset]);
3451   } while (NextIndex(input_num_dims, input_dims, input_iter));
3452   return true;
3453 }
3454 
ResolveAxis(const int num_dims,const int * axis,const int64_t num_axis,int * out_axis,int * out_num_axis)3455 inline bool ResolveAxis(const int num_dims, const int* axis,
3456                         const int64_t num_axis, int* out_axis,
3457                         int* out_num_axis) {
3458   *out_num_axis = 0;  // Just in case.
3459   // Short-circuit axis resolution for scalars; the axis will go unused.
3460   if (num_dims == 0) {
3461     return true;
3462   }
3463   // o(n^2) is fine since out_num_axis should be really small, mostly <= 4
3464   for (int64_t idx = 0; idx < num_axis; ++idx) {
3465     // Handle negative index.
3466     int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
3467     TFLITE_DCHECK(current >= 0 && current < num_dims);
3468     bool is_dup = false;
3469     for (int j = 0; j < *out_num_axis; ++j) {
3470       if (out_axis[j] == current) {
3471         is_dup = true;
3472         break;
3473       }
3474     }
3475     if (!is_dup) {
3476       out_axis[*out_num_axis] = current;
3477       *out_num_axis += 1;
3478     }
3479   }
3480   return true;
3481 }
3482 
3483 // This method expects that output_data has been initialized.
3484 template <typename In, typename Out>
ReduceSumImpl(const In * input_data,const int * input_dims,const int * output_dims,const int input_num_dims,const int output_num_dims,const int * axis,const int num_axis,int * input_iter,Out * output_data)3485 inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
3486                           const int* output_dims, const int input_num_dims,
3487                           const int output_num_dims, const int* axis,
3488                           const int num_axis, int* input_iter,
3489                           Out* output_data) {
3490   auto reducer = [](const Out current, const In in) -> Out {
3491     const Out actual_in = static_cast<Out>(in);
3492     return current + actual_in;
3493   };
3494   return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
3495                          output_num_dims, axis, num_axis, input_iter, reducer,
3496                          output_data);
3497 }
3498 
3499 template <typename T>
InitTensorDataForReduce(const int * dims,const int num_dims,const T init_value,T * data)3500 inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
3501                                     const T init_value, T* data) {
3502   size_t num_elements = 1;
3503   for (int idx = 0; idx < num_dims; ++idx) {
3504     size_t current = static_cast<size_t>(dims[idx]);
3505     // Overflow prevention.
3506     if (num_elements > std::numeric_limits<size_t>::max() / current) {
3507       return false;
3508     }
3509     num_elements *= current;
3510   }
3511   for (size_t idx = 0; idx < num_elements; ++idx) {
3512     data[idx] = init_value;
3513   }
3514   return true;
3515 }
3516 
3517 // Computes the generic value (i.e., sum/max/min/prod) of elements across
3518 // dimensions given in axis. It needs to pass in init_value and reducer.
3519 template <typename T>
ReduceGeneric(const T * input_data,const int * input_dims,const int input_num_dims,T * output_data,const int * output_dims,const int output_num_dims,const int * axis,const int64_t num_axis_dimensions,bool keep_dims,int * temp_index,int * resolved_axis,T init_value,T reducer (const T current,const T in))3520 inline bool ReduceGeneric(const T* input_data, const int* input_dims,
3521                           const int input_num_dims, T* output_data,
3522                           const int* output_dims, const int output_num_dims,
3523                           const int* axis, const int64_t num_axis_dimensions,
3524                           bool keep_dims, int* temp_index, int* resolved_axis,
3525                           T init_value,
3526                           T reducer(const T current, const T in)) {
3527   // Reset output data.
3528   if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
3529                                output_data)) {
3530     return false;
3531   }
3532 
3533   // Resolve axis.
3534   int num_resolved_axis = 0;
3535   if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
3536                    &num_resolved_axis)) {
3537     return false;
3538   }
3539 
3540   return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
3541                       output_num_dims, resolved_axis, num_resolved_axis,
3542                       temp_index, reducer, output_data);
3543 }
3544 
3545 // Computes the mean of elements across dimensions given in axis.
3546 // It does so in two stages, first calculates the sum of elements along the axis
3547 // then divides it by the number of element in axis.
3548 template <typename T, typename U>
Mean(const T * input_data,const int * input_dims,const int input_num_dims,T * output_data,const int * output_dims,const int output_num_dims,const int * axis,const int num_axis_dimensions,bool keep_dims,int * temp_index,int * resolved_axis,U * temp_sum)3549 inline bool Mean(const T* input_data, const int* input_dims,
3550                  const int input_num_dims, T* output_data,
3551                  const int* output_dims, const int output_num_dims,
3552                  const int* axis, const int num_axis_dimensions, bool keep_dims,
3553                  int* temp_index, int* resolved_axis, U* temp_sum) {
3554   gemmlowp::ScopedProfilingLabel label("Mean");
3555   // Reset output data.
3556   size_t num_outputs = 1;
3557   for (int idx = 0; idx < output_num_dims; ++idx) {
3558     size_t current = static_cast<size_t>(output_dims[idx]);
3559     // Overflow prevention.
3560     if (num_outputs > std::numeric_limits<size_t>::max() / current) {
3561       return false;
3562     }
3563     num_outputs *= current;
3564   }
3565   for (size_t idx = 0; idx < num_outputs; ++idx) {
3566     output_data[idx] = T();
3567     temp_sum[idx] = U();
3568   }
3569 
3570   // Resolve axis.
3571   int num_resolved_axis = 0;
3572   if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
3573                    &num_resolved_axis)) {
3574     return false;
3575   }
3576 
3577   if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
3578                            output_num_dims, resolved_axis, num_resolved_axis,
3579                            temp_index, temp_sum)) {
3580     return false;
3581   }
3582 
3583   // Calculate mean by dividing output_data by num of aggregated element.
3584   U num_elements_in_axis = 1;
3585   for (int idx = 0; idx < num_resolved_axis; ++idx) {
3586     size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
3587     // Overflow prevention.
3588     if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
3589       return false;
3590     }
3591     num_elements_in_axis *= current;
3592   }
3593 
3594   if (num_elements_in_axis > 0) {
3595     for (size_t idx = 0; idx < num_outputs; ++idx) {
3596       output_data[idx] =
3597           static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
3598     }
3599   }
3600   return true;
3601 }
3602 
3603 template <typename T>
Mean(const tflite::MeanParams & op_params,const RuntimeShape & unextended_input_shape,const T * input_data,const RuntimeShape & unextended_output_shape,T * output_data)3604 inline void Mean(const tflite::MeanParams& op_params,
3605                  const RuntimeShape& unextended_input_shape,
3606                  const T* input_data,
3607                  const RuntimeShape& unextended_output_shape, T* output_data) {
3608   gemmlowp::ScopedProfilingLabel label("Mean4D");
3609 
3610   // Current implementation only supports dimension equals 4 and simultaneous
3611   // reduction over width and height.
3612   TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
3613   TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3614   const RuntimeShape input_shape =
3615       RuntimeShape::ExtendedShape(4, unextended_input_shape);
3616   const RuntimeShape output_shape =
3617       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3618 
3619   const int output_batch = output_shape.Dims(0);
3620   const int output_height = output_shape.Dims(1);
3621   const int output_width = output_shape.Dims(2);
3622   const int output_depth = output_shape.Dims(3);
3623 
3624   const int input_height = input_shape.Dims(1);
3625   const int input_width = input_shape.Dims(2);
3626 
3627   TFLITE_DCHECK_EQ(op_params.axis_count, 2);
3628   TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
3629                 (op_params.axis[0] == 2 && op_params.axis[1] == 1));
3630   TFLITE_DCHECK_EQ(output_height, 1);
3631   TFLITE_DCHECK_EQ(output_width, 1);
3632 
3633   for (int out_b = 0; out_b < output_batch; ++out_b) {
3634     for (int out_d = 0; out_d < output_depth; ++out_d) {
3635       float value = 0;
3636       for (int in_h = 0; in_h < input_height; ++in_h) {
3637         for (int in_w = 0; in_w < input_width; ++in_w) {
3638           value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
3639         }
3640       }
3641       output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
3642           value / (input_width * input_height);
3643     }
3644   }
3645 }
3646 
Mean(const tflite::MeanParams & op_params,const RuntimeShape & unextended_input_shape,const uint8_t * input_data,int32 input_zero_point,float input_scale,const RuntimeShape & unextended_output_shape,uint8_t * output_data,int32 output_zero_point,float output_scale)3647 inline void Mean(const tflite::MeanParams& op_params,
3648                  const RuntimeShape& unextended_input_shape,
3649                  const uint8_t* input_data, int32 input_zero_point,
3650                  float input_scale, const RuntimeShape& unextended_output_shape,
3651                  uint8_t* output_data, int32 output_zero_point,
3652                  float output_scale) {
3653   gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8");
3654 
3655   // Current implementation only supports dimension equals 4 and simultaneous
3656   // reduction over width and height.
3657   TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
3658   TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3659   const RuntimeShape input_shape =
3660       RuntimeShape::ExtendedShape(4, unextended_input_shape);
3661   const RuntimeShape output_shape =
3662       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3663   const int output_batch = output_shape.Dims(0);
3664   const int output_height = output_shape.Dims(1);
3665   const int output_width = output_shape.Dims(2);
3666   const int output_depth = output_shape.Dims(3);
3667   const int input_height = input_shape.Dims(1);
3668   const int input_width = input_shape.Dims(2);
3669   const float num_elements_in_axis = input_width * input_height;
3670 
3671   TFLITE_DCHECK_EQ(op_params.axis_count, 2);
3672   TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
3673                 (op_params.axis[0] == 2 && op_params.axis[1] == 1));
3674   TFLITE_DCHECK_EQ(output_height, 1);
3675   TFLITE_DCHECK_EQ(output_width, 1);
3676 
3677   const bool ordinary_mean =
3678       (input_zero_point == output_zero_point && input_scale == output_scale);
3679   float scale, bias;
3680   if (!ordinary_mean) {
3681     scale = input_scale / output_scale;
3682     bias = -input_zero_point * scale + 0.5;
3683   }
3684   for (int out_b = 0; out_b < output_batch; ++out_b) {
3685     for (int out_d = 0; out_d < output_depth; ++out_d) {
3686       float temp_value = 0;
3687       for (int in_h = 0; in_h < input_height; ++in_h) {
3688         for (int in_w = 0; in_w < input_width; ++in_w) {
3689           temp_value +=
3690               input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
3691         }
3692       }
3693       temp_value = temp_value / num_elements_in_axis;
3694       if (ordinary_mean) {
3695         output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
3696             static_cast<uint8_t>(round(temp_value));
3697       } else {
3698         output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
3699             static_cast<uint8_t>(round(temp_value * scale + bias)) +
3700             output_zero_point;
3701       }
3702     }
3703   }
3704 }
3705 
3706 // Computes the mean of elements across dimensions given in axis.
3707 // It does so in two stages, first calculates the sum of elements along the axis
3708 // then divides it by the number of element in axis for quantized values.
3709 template <typename T, typename U>
QuantizedMeanOrSum(const T * input_data,int32 input_zero_point,float input_scale,const int * input_dims,const int input_num_dims,T * output_data,int32 output_zero_point,float output_scale,const int * output_dims,const int output_num_dims,const int * axis,const int num_axis_dimensions,bool keep_dims,int * temp_index,int * resolved_axis,U * temp_sum,bool compute_sum)3710 inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
3711                                float input_scale, const int* input_dims,
3712                                const int input_num_dims, T* output_data,
3713                                int32 output_zero_point, float output_scale,
3714                                const int* output_dims,
3715                                const int output_num_dims, const int* axis,
3716                                const int num_axis_dimensions, bool keep_dims,
3717                                int* temp_index, int* resolved_axis, U* temp_sum,
3718                                bool compute_sum) {
3719   gemmlowp::ScopedProfilingLabel label(compute_sum ? "Sum/Uint8"
3720                                                    : "Mean/Uint8");
3721   // Reset output data.
3722   size_t num_outputs = 1;
3723   for (int idx = 0; idx < output_num_dims; ++idx) {
3724     size_t current = static_cast<size_t>(output_dims[idx]);
3725     // Overflow prevention.
3726     if (num_outputs > std::numeric_limits<size_t>::max() / current) {
3727       return false;
3728     }
3729     num_outputs *= current;
3730   }
3731   for (size_t idx = 0; idx < num_outputs; ++idx) {
3732     output_data[idx] = T();
3733     temp_sum[idx] = U();
3734   }
3735 
3736   // Resolve axis.
3737   int num_resolved_axis = 0;
3738   if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
3739                    &num_resolved_axis)) {
3740     return false;
3741   }
3742 
3743   if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
3744                            output_num_dims, resolved_axis, num_resolved_axis,
3745                            temp_index, temp_sum)) {
3746     return false;
3747   }
3748 
3749   // Calculate mean by dividing output_data by num of aggregated element.
3750   U num_elements_in_axis = 1;
3751   for (int idx = 0; idx < num_resolved_axis; ++idx) {
3752     size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
3753     // Overflow prevention.
3754     if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
3755       return false;
3756     }
3757     num_elements_in_axis *= current;
3758   }
3759 
3760   if (num_elements_in_axis > 0) {
3761     const float scale = input_scale / output_scale;
3762     if (compute_sum) {
3763       // TODO(b/116341117): Eliminate float and do this completely in 8bit.
3764       const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5;
3765       for (size_t idx = 0; idx < num_outputs; ++idx) {
3766         const U value = static_cast<U>(round(temp_sum[idx] * scale + bias)) +
3767                         output_zero_point;
3768         output_data[idx] = static_cast<T>(value);
3769       }
3770     } else {
3771       const float bias = -input_zero_point * scale + 0.5;
3772       for (size_t idx = 0; idx < num_outputs; ++idx) {
3773         float float_mean = static_cast<float>(temp_sum[idx]) /
3774                            static_cast<float>(num_elements_in_axis);
3775 
3776         // Convert to float value.
3777         output_data[idx] = static_cast<T>(round(float_mean * scale + bias)) +
3778                            output_zero_point;
3779       }
3780     }
3781   }
3782   return true;
3783 }
3784 
3785 template <typename T>
Minimum(const RuntimeShape & input1_shape,const T * input1_data,const T * input2_data,const RuntimeShape & output_shape,T * output_data)3786 void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
3787              const T* input2_data, const RuntimeShape& output_shape,
3788              T* output_data) {
3789   const int flat_size = MatchingFlatSize(input1_shape, output_shape);
3790 
3791   auto min_value = input2_data[0];
3792   for (int i = 0; i < flat_size; i++) {
3793     output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
3794   }
3795 }
3796 
3797 // Convenience version that allows, for example, generated-code calls to be
3798 // the same as other binary ops.
3799 template <typename T>
Minimum(const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape &,const T * input2_data,const RuntimeShape & output_shape,T * output_data)3800 inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
3801                     const RuntimeShape&, const T* input2_data,
3802                     const RuntimeShape& output_shape, T* output_data) {
3803   // Drop shape of second input: not needed.
3804   Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
3805 }
3806 
3807 template <typename T>
Maximum(const RuntimeShape & input1_shape,const T * input1_data,const T * input2_data,const RuntimeShape & output_shape,T * output_data)3808 void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
3809              const T* input2_data, const RuntimeShape& output_shape,
3810              T* output_data) {
3811   const int flat_size = MatchingFlatSize(input1_shape, output_shape);
3812 
3813   auto max_value = input2_data[0];
3814   for (int i = 0; i < flat_size; i++) {
3815     output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
3816   }
3817 }
3818 
3819 // Convenience version that allows, for example, generated-code calls to be
3820 // the same as other binary ops.
3821 template <typename T>
Maximum(const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape &,const T * input2_data,const RuntimeShape & output_shape,T * output_data)3822 inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
3823                     const RuntimeShape&, const T* input2_data,
3824                     const RuntimeShape& output_shape, T* output_data) {
3825   // Drop shape of second input: not needed.
3826   Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
3827 }
3828 
3829 template <typename T, typename Op>
MaximumMinimumBroadcast4DSlow(const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data,Op op)3830 void MaximumMinimumBroadcast4DSlow(const RuntimeShape& unextended_input1_shape,
3831                                    const T* input1_data,
3832                                    const RuntimeShape& unextended_input2_shape,
3833                                    const T* input2_data,
3834                                    const RuntimeShape& unextended_output_shape,
3835                                    T* output_data, Op op) {
3836   gemmlowp::ScopedProfilingLabel label("MaximumMinimumBroadcast4DSlow");
3837   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
3838   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
3839   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3840   const RuntimeShape output_shape =
3841       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3842 
3843   NdArrayDesc<4> desc1;
3844   NdArrayDesc<4> desc2;
3845   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
3846                                       unextended_input2_shape, &desc1, &desc2);
3847 
3848   for (int b = 0; b < output_shape.Dims(0); ++b) {
3849     for (int y = 0; y < output_shape.Dims(1); ++y) {
3850       for (int x = 0; x < output_shape.Dims(2); ++x) {
3851         for (int c = 0; c < output_shape.Dims(3); ++c) {
3852           auto out_idx = Offset(output_shape, b, y, x, c);
3853           auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
3854           auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
3855           auto in1_val = input1_data[in1_idx];
3856           auto in2_val = input2_data[in2_idx];
3857           output_data[out_idx] = op(in1_val, in2_val);
3858         }
3859       }
3860     }
3861   }
3862 }
3863 
3864 template <typename T1, typename T2, typename T3, typename Cmp>
ArgMinMax(const RuntimeShape & input1_shape,const T1 * input1_data,const T3 * input2_data,const RuntimeShape & output_shape,T2 * output_data,const Cmp & cmp)3865 void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
3866                const T3* input2_data, const RuntimeShape& output_shape,
3867                T2* output_data, const Cmp& cmp) {
3868   gemmlowp::ScopedProfilingLabel label("ArgMinMax");
3869   TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
3870   TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
3871                    output_shape.DimensionsCount());
3872 
3873   int axis = input2_data[0];
3874   if (axis < 0) {
3875     axis += input1_shape.DimensionsCount();
3876   }
3877 
3878   const int axis_size = input1_shape.Dims(axis);
3879 
3880   int outer_size = 1;
3881   for (int i = 0; i < axis; ++i) {
3882     TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
3883     outer_size *= input1_shape.Dims(i);
3884   }
3885 
3886   int inner_size = 1;
3887   const int dims_count = input1_shape.DimensionsCount();
3888   for (int i = axis + 1; i < dims_count; ++i) {
3889     TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
3890     inner_size *= input1_shape.Dims(i);
3891   }
3892 
3893   for (int outer = 0; outer < outer_size; ++outer) {
3894     for (int inner = 0; inner < inner_size; ++inner) {
3895       auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
3896       int min_max_index = 0;
3897       for (int i = 1; i < axis_size; ++i) {
3898         const auto& curr_value =
3899             input1_data[(outer * axis_size + i) * inner_size + inner];
3900         if (cmp(curr_value, min_max_value)) {
3901           min_max_value = curr_value;
3902           min_max_index = i;
3903         }
3904       }
3905       output_data[outer * inner_size + inner] = min_max_index;
3906     }
3907   }
3908 }
3909 
3910 template <typename T1, typename T2, typename T3>
ArgMax(const RuntimeShape & input1_shape,const T1 * input1_data,const T3 * input2_data,const RuntimeShape & output_shape,T2 * output_data)3911 void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
3912             const T3* input2_data, const RuntimeShape& output_shape,
3913             T2* output_data) {
3914   ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data,
3915             std::greater<T1>());
3916 }
3917 
3918 // Convenience version that allows, for example, generated-code calls to be
3919 // the same as other binary ops.
3920 template <typename T1, typename T2, typename T3>
ArgMax(const RuntimeShape & input1_shape,const T1 * input1_data,const RuntimeShape & input2_shape,const T3 * input2_data,const RuntimeShape & output_shape,T2 * output_data)3921 inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
3922                    const RuntimeShape& input2_shape, const T3* input2_data,
3923                    const RuntimeShape& output_shape, T2* output_data) {
3924   // Drop shape of second input: not needed.
3925   ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
3926 }
3927 
3928 template <typename T>
Transpose(const TransposeParams & params,const RuntimeShape & unextended_input_shape,const T * input_data,const RuntimeShape & unextended_output_shape,T * output_data)3929 void Transpose(const TransposeParams& params,
3930                const RuntimeShape& unextended_input_shape, const T* input_data,
3931                const RuntimeShape& unextended_output_shape, T* output_data) {
3932   const int unextended_output_size = unextended_output_shape.DimensionsCount();
3933   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
3934   TFLITE_DCHECK_LE(unextended_output_size, 4);
3935   TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
3936   const RuntimeShape input_shape =
3937       RuntimeShape::ExtendedShape(4, unextended_input_shape);
3938   const RuntimeShape output_shape =
3939       RuntimeShape::ExtendedShape(4, unextended_output_shape);
3940   const int input_ext_size = 4 - unextended_input_shape.DimensionsCount();
3941   const int output_ext_size = 4 - unextended_output_size;
3942 
3943   // The perm data is extended to match the output, each index incremented by
3944   // the amount of front padding of the input shape.
3945   int extended_perm[4];
3946   for (int i = 0; i < output_ext_size; ++i) {
3947     extended_perm[i] = i;
3948   }
3949   for (int i = 0; i < unextended_output_size; ++i) {
3950     extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size;
3951   }
3952 
3953   int out_sizes[4];
3954   // Compute the inverse permutation array so we can do an output centered
3955   // transpose. Also, check to make sure output_dims is matching input_dims.
3956   for (int k = 0; k < 4; k++) {
3957     out_sizes[k] = MatchingDim(input_shape, extended_perm[k], output_shape, k);
3958   }
3959 
3960   // Naive transpose loop (iterate on output index and compute input index).
3961   int o[4];  // loop index (on output).
3962   int i[4];
3963   for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
3964     i[extended_perm[3]] = o[3];
3965     for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
3966       i[extended_perm[2]] = o[2];
3967       for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
3968         i[extended_perm[1]] = o[1];
3969         for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
3970           i[extended_perm[0]] = o[0];
3971           output_data[Offset(output_shape, o)] =
3972               input_data[Offset(input_shape, i)];
3973         }
3974       }
3975     }
3976   }
3977 }
3978 
TransposeConv(const ConvParams & params,const RuntimeShape & input_shape,const float * input_data,const RuntimeShape & filter_shape,const float * filter_data,const RuntimeShape & output_shape,float * output_data,const RuntimeShape & im2col_shape,float * im2col_data)3979 inline void TransposeConv(
3980     const ConvParams& params, const RuntimeShape& input_shape,
3981     const float* input_data, const RuntimeShape& filter_shape,
3982     const float* filter_data, const RuntimeShape& output_shape,
3983     float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
3984   const int stride_width = params.stride_width;
3985   const int stride_height = params.stride_height;
3986   const int pad_width = params.padding_values.width;
3987   const int pad_height = params.padding_values.height;
3988   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
3989   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
3990   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
3991   (void)im2col_data;   // only used in optimized code.
3992   (void)im2col_shape;  // only used in optimized code.
3993 
3994   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
3995   const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
3996   const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
3997   const int input_height = input_shape.Dims(1);
3998   const int input_width = input_shape.Dims(2);
3999   const int filter_height = filter_shape.Dims(1);
4000   const int filter_width = filter_shape.Dims(2);
4001   const int output_height = output_shape.Dims(1);
4002   const int output_width = output_shape.Dims(2);
4003 
4004   // Although transpose convolution simplifies to convolution with transposed
4005   // weights for strides of 1, non-unitary striding complicates matters. To
4006   // keep this reference implementation as clear as possible, we use a
4007   // "scatter" access pattern, where we loop through all the input elements,
4008   // computing their influence on the output, rather than looping through the
4009   // output elements in the typical "gather" access pattern of a conv. We
4010   // therefore must initialize the output array to zero.
4011   const int num_elements = output_shape.FlatSize();
4012   for (int i = 0; i < num_elements; i++) {
4013     output_data[i] = 0.0f;
4014   }
4015 
4016   // Loop through input elements one at a time.
4017   for (int batch = 0; batch < batches; ++batch) {
4018     for (int in_y = 0; in_y < input_height; ++in_y) {
4019       for (int in_x = 0; in_x < input_width; ++in_x) {
4020         for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
4021           // Loop through the output elements it will influence
4022           const int out_x_origin = (in_x * stride_width) - pad_width;
4023           const int out_y_origin = (in_y * stride_height) - pad_height;
4024           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
4025             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
4026               for (int out_channel = 0; out_channel < output_depth;
4027                    ++out_channel) {
4028                 // Compute output element location
4029                 const int out_x = out_x_origin + filter_x;
4030                 const int out_y = out_y_origin + filter_y;
4031                 // We cannot accumulate out of bounds
4032                 if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
4033                     (out_y < output_height)) {
4034                   float input_value = input_data[Offset(
4035                       input_shape, batch, in_y, in_x, in_channel)];
4036                   float filter_value =
4037                       filter_data[Offset(filter_shape, out_channel, filter_y,
4038                                          filter_x, in_channel)];
4039                   output_data[Offset(output_shape, batch, out_y, out_x,
4040                                      out_channel)] +=
4041                       input_value * filter_value;
4042                 }
4043               }
4044             }
4045           }
4046         }
4047       }
4048     }
4049   }
4050 }
4051 
4052 template <typename T>
EqualFn(T lhs,T rhs)4053 inline bool EqualFn(T lhs, T rhs) {
4054   return lhs == rhs;
4055 }
4056 
4057 template <typename T>
NotEqualFn(T lhs,T rhs)4058 inline bool NotEqualFn(T lhs, T rhs) {
4059   return lhs != rhs;
4060 }
4061 
4062 template <typename T>
GreaterFn(T lhs,T rhs)4063 inline bool GreaterFn(T lhs, T rhs) {
4064   return lhs > rhs;
4065 }
4066 template <typename T>
GreaterEqualFn(T lhs,T rhs)4067 inline bool GreaterEqualFn(T lhs, T rhs) {
4068   return lhs >= rhs;
4069 }
4070 template <typename T>
LessFn(T lhs,T rhs)4071 inline bool LessFn(T lhs, T rhs) {
4072   return lhs < rhs;
4073 }
4074 template <typename T>
LessEqualFn(T lhs,T rhs)4075 inline bool LessEqualFn(T lhs, T rhs) {
4076   return lhs <= rhs;
4077 }
4078 
4079 template <typename T>
4080 using ComparisonFn = bool (*)(T, T);
4081 
4082 template <typename T, ComparisonFn<T> F>
ComparisonImpl(const ComparisonParams & op_params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,bool * output_data)4083 inline void ComparisonImpl(
4084     const ComparisonParams& op_params, const RuntimeShape& input1_shape,
4085     const T* input1_data, const RuntimeShape& input2_shape,
4086     const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
4087   const int64_t flatsize =
4088       MatchingFlatSize(input1_shape, input2_shape, output_shape);
4089   for (int64_t i = 0; i < flatsize; ++i) {
4090     output_data[i] = F(input1_data[i], input2_data[i]);
4091   }
4092 }
4093 
4094 template <ComparisonFn<float> F>
Comparison(const ComparisonParams & op_params,const RuntimeShape & input1_shape,const float * input1_data,const RuntimeShape & input2_shape,const float * input2_data,const RuntimeShape & output_shape,bool * output_data)4095 inline void Comparison(const ComparisonParams& op_params,
4096                        const RuntimeShape& input1_shape,
4097                        const float* input1_data,
4098                        const RuntimeShape& input2_shape,
4099                        const float* input2_data,
4100                        const RuntimeShape& output_shape, bool* output_data) {
4101   ComparisonImpl<float, F>(op_params, input1_shape, input1_data, input2_shape,
4102                            input2_data, output_shape, output_data);
4103 }
4104 
4105 template <typename T, ComparisonFn<int32> F>
ComparisonWithScaling(const ComparisonParams & op_params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,bool * output_data)4106 inline void ComparisonWithScaling(
4107     const ComparisonParams& op_params, const RuntimeShape& input1_shape,
4108     const T* input1_data, const RuntimeShape& input2_shape,
4109     const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
4110   int left_shift = op_params.left_shift;
4111   int32 input1_offset = op_params.input1_offset;
4112   int32 input1_multiplier = op_params.input1_multiplier;
4113   int input1_shift = op_params.input1_shift;
4114   int32 input2_offset = op_params.input2_offset;
4115   int32 input2_multiplier = op_params.input2_multiplier;
4116   int input2_shift = op_params.input2_shift;
4117 
4118   const int64_t flatsize =
4119       MatchingFlatSize(input1_shape, input2_shape, output_shape);
4120   for (int64_t i = 0; i < flatsize; ++i) {
4121     const int32 input1_val = input1_offset + input1_data[i];
4122     const int32 input2_val = input2_offset + input2_data[i];
4123     const int32 shifted_input1_val = input1_val * (1 << left_shift);
4124     const int32 shifted_input2_val = input2_val * (1 << left_shift);
4125     const int32 scaled_input1_val =
4126         MultiplyByQuantizedMultiplierSmallerThanOneExp(
4127             shifted_input1_val, input1_multiplier, input1_shift);
4128     const int32 scaled_input2_val =
4129         MultiplyByQuantizedMultiplierSmallerThanOneExp(
4130             shifted_input2_val, input2_multiplier, input2_shift);
4131     output_data[i] = F(scaled_input1_val, scaled_input2_val);
4132   }
4133 }
4134 
4135 template <typename T, ComparisonFn<T> F>
BroadcastComparison4DSlowImpl(const ComparisonParams & op_params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,bool * output_data)4136 inline void BroadcastComparison4DSlowImpl(
4137     const ComparisonParams& op_params,
4138     const RuntimeShape& unextended_input1_shape, const T* input1_data,
4139     const RuntimeShape& unextended_input2_shape, const T* input2_data,
4140     const RuntimeShape& unextended_output_shape, bool* output_data) {
4141   gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlow");
4142   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4143   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4144   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4145   const RuntimeShape output_shape =
4146       RuntimeShape::ExtendedShape(4, unextended_output_shape);
4147 
4148   NdArrayDesc<4> desc1;
4149   NdArrayDesc<4> desc2;
4150   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4151                                       unextended_input2_shape, &desc1, &desc2);
4152 
4153   for (int b = 0; b < output_shape.Dims(0); ++b) {
4154     for (int y = 0; y < output_shape.Dims(1); ++y) {
4155       for (int x = 0; x < output_shape.Dims(2); ++x) {
4156         for (int c = 0; c < output_shape.Dims(3); ++c) {
4157           output_data[Offset(output_shape, b, y, x, c)] =
4158               F(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
4159                 input2_data[SubscriptToIndex(desc2, b, y, x, c)]);
4160         }
4161       }
4162     }
4163   }
4164 }
4165 template <ComparisonFn<float> F>
BroadcastComparison4DSlow(const ComparisonParams & op_params,const RuntimeShape & input1_shape,const float * input1_data,const RuntimeShape & input2_shape,const float * input2_data,const RuntimeShape & output_shape,bool * output_data)4166 inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
4167                                       const RuntimeShape& input1_shape,
4168                                       const float* input1_data,
4169                                       const RuntimeShape& input2_shape,
4170                                       const float* input2_data,
4171                                       const RuntimeShape& output_shape,
4172                                       bool* output_data) {
4173   BroadcastComparison4DSlowImpl<float, F>(op_params, input1_shape, input1_data,
4174                                           input2_shape, input2_data,
4175                                           output_shape, output_data);
4176 }
4177 
4178 template <typename T, ComparisonFn<int32> F>
BroadcastComparison4DSlowWithScaling(const ComparisonParams & op_params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,bool * output_data)4179 inline void BroadcastComparison4DSlowWithScaling(
4180     const ComparisonParams& op_params,
4181     const RuntimeShape& unextended_input1_shape, const T* input1_data,
4182     const RuntimeShape& unextended_input2_shape, const T* input2_data,
4183     const RuntimeShape& unextended_output_shape, bool* output_data) {
4184   gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlowWithScaling");
4185   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4186   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4187   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4188   const RuntimeShape output_shape =
4189       RuntimeShape::ExtendedShape(4, unextended_output_shape);
4190 
4191   NdArrayDesc<4> desc1;
4192   NdArrayDesc<4> desc2;
4193   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4194                                       unextended_input2_shape, &desc1, &desc2);
4195 
4196   int left_shift = op_params.left_shift;
4197   int32 input1_offset = op_params.input1_offset;
4198   int32 input1_multiplier = op_params.input1_multiplier;
4199   int input1_shift = op_params.input1_shift;
4200   int32 input2_offset = op_params.input2_offset;
4201   int32 input2_multiplier = op_params.input2_multiplier;
4202   int input2_shift = op_params.input2_shift;
4203 
4204   for (int b = 0; b < output_shape.Dims(0); ++b) {
4205     for (int y = 0; y < output_shape.Dims(1); ++y) {
4206       for (int x = 0; x < output_shape.Dims(2); ++x) {
4207         for (int c = 0; c < output_shape.Dims(3); ++c) {
4208           const int32 input1_val =
4209               input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
4210           const int32 input2_val =
4211               input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
4212           const int32 shifted_input1_val = input1_val * (1 << left_shift);
4213           const int32 shifted_input2_val = input2_val * (1 << left_shift);
4214           const int32 scaled_input1_val =
4215               MultiplyByQuantizedMultiplierSmallerThanOneExp(
4216                   shifted_input1_val, input1_multiplier, input1_shift);
4217           const int32 scaled_input2_val =
4218               MultiplyByQuantizedMultiplierSmallerThanOneExp(
4219                   shifted_input2_val, input2_multiplier, input2_shift);
4220           output_data[Offset(output_shape, b, y, x, c)] =
4221               F(scaled_input1_val, scaled_input2_val);
4222         }
4223       }
4224     }
4225   }
4226 }
4227 
4228 #define TFLITE_COMPARISON_OP(name)                                             \
4229   inline void name(const ComparisonParams& op_params,                          \
4230                    const RuntimeShape& input1_shape, const float* input1_data, \
4231                    const RuntimeShape& input2_shape, const float* input2_data, \
4232                    const RuntimeShape& output_shape, bool* output_data) {      \
4233     gemmlowp::ScopedProfilingLabel label(#name);                               \
4234     Comparison<name##Fn>(op_params, input1_shape, input1_data, input2_shape,   \
4235                          input2_data, output_shape, output_data);              \
4236   }                                                                            \
4237   template <typename T>                                                        \
4238   inline void name##NoScaling(                                                 \
4239       const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
4240       const T* input1_data, const RuntimeShape& input2_shape,                  \
4241       const T* input2_data, const RuntimeShape& output_shape,                  \
4242       bool* output_data) {                                                     \
4243     gemmlowp::ScopedProfilingLabel label(#name "NoScaling");                   \
4244     ComparisonImpl<T, name##Fn>(op_params, input1_shape, input1_data,          \
4245                                 input2_shape, input2_data, output_shape,       \
4246                                 output_data);                                  \
4247   }                                                                            \
4248   template <typename T>                                                        \
4249   inline void name##WithScaling(                                               \
4250       const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
4251       const T* input1_data, const RuntimeShape& input2_shape,                  \
4252       const T* input2_data, const RuntimeShape& output_shape,                  \
4253       bool* output_data) {                                                     \
4254     gemmlowp::ScopedProfilingLabel label(#name "WithScaling/8bit");            \
4255     ComparisonWithScaling<T, name##Fn>(op_params, input1_shape, input1_data,   \
4256                                        input2_shape, input2_data,              \
4257                                        output_shape, output_data);             \
4258   }                                                                            \
4259   template <typename T>                                                        \
4260   inline void Broadcast4DSlow##name##NoScaling(                                \
4261       const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
4262       const T* input1_data, const RuntimeShape& input2_shape,                  \
4263       const T* input2_data, const RuntimeShape& output_shape,                  \
4264       bool* output_data) {                                                     \
4265     gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name "NoScaling"); \
4266     BroadcastComparison4DSlowImpl<T, name##Fn>(                                \
4267         op_params, input1_shape, input1_data, input2_shape, input2_data,       \
4268         output_shape, output_data);                                            \
4269   }                                                                            \
4270   inline void Broadcast4DSlow##name(                                           \
4271       const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
4272       const float* input1_data, const RuntimeShape& input2_shape,              \
4273       const float* input2_data, const RuntimeShape& output_shape,              \
4274       bool* output_data) {                                                     \
4275     gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name);             \
4276     BroadcastComparison4DSlow<name##Fn>(op_params, input1_shape, input1_data,  \
4277                                         input2_shape, input2_data,             \
4278                                         output_shape, output_data);            \
4279   }                                                                            \
4280   template <typename T>                                                        \
4281   inline void Broadcast4DSlow##name##WithScaling(                              \
4282       const ComparisonParams& op_params, const RuntimeShape& input1_shape,     \
4283       const T* input1_data, const RuntimeShape& input2_shape,                  \
4284       const T* input2_data, const RuntimeShape& output_shape,                  \
4285       bool* output_data) {                                                     \
4286     gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name "/8bit");     \
4287     BroadcastComparison4DSlowWithScaling<T, name##Fn>(                         \
4288         op_params, input1_shape, input1_data, input2_shape, input2_data,       \
4289         output_shape, output_data);                                            \
4290   }
4291 TFLITE_COMPARISON_OP(Equal);
4292 TFLITE_COMPARISON_OP(NotEqual);
4293 TFLITE_COMPARISON_OP(Greater);
4294 TFLITE_COMPARISON_OP(GreaterEqual);
4295 TFLITE_COMPARISON_OP(Less);
4296 TFLITE_COMPARISON_OP(LessEqual);
4297 #undef TFLITE_COMPARISON_OP
4298 
4299 template <typename D, typename T>
Select(const RuntimeShape & input_condition_shape,const D * input_condition_data,const RuntimeShape & input_x_shape,const T * input_x_data,const RuntimeShape & input_y_shape,const T * input_y_data,const RuntimeShape & output_shape,T * output_data)4300 void Select(const RuntimeShape& input_condition_shape,
4301             const D* input_condition_data, const RuntimeShape& input_x_shape,
4302             const T* input_x_data, const RuntimeShape& input_y_shape,
4303             const T* input_y_data, const RuntimeShape& output_shape,
4304             T* output_data) {
4305   const int64_t flatsize = MatchingFlatSize(
4306       input_condition_shape, input_x_shape, input_y_shape, output_shape);
4307   for (int64_t i = 0; i < flatsize; ++i) {
4308     output_data[i] =
4309         input_condition_data[i] ? input_x_data[i] : input_y_data[i];
4310   }
4311 }
4312 
4313 template <typename D, typename T>
RankOneSelect(const RuntimeShape & input_condition_shape,const D * input_condition_data,const RuntimeShape & input_x_shape,const T * input_x_data,const RuntimeShape & input_y_shape,const T * input_y_data,const RuntimeShape & output_shape,T * output_data)4314 void RankOneSelect(const RuntimeShape& input_condition_shape,
4315                    const D* input_condition_data,
4316                    const RuntimeShape& input_x_shape, const T* input_x_data,
4317                    const RuntimeShape& input_y_shape, const T* input_y_data,
4318                    const RuntimeShape& output_shape, T* output_data) {
4319   const int64_t outer_size = input_condition_shape.FlatSize();
4320   TFLITE_DCHECK_EQ(
4321       MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0),
4322       outer_size);
4323   const int64_t inner_size =
4324       MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
4325 
4326   int64_t offset = 0;
4327   for (int64_t i = 0; i < outer_size; i++) {
4328     const T* input_data = input_condition_data[i] ? input_x_data : input_y_data;
4329     memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
4330     offset += inner_size;
4331   }
4332 }
4333 
4334 template <typename D, typename T>
SelectTrueCoords(const RuntimeShape & input_condition_shape,const D * input_condition_data,T * output_data)4335 void SelectTrueCoords(const RuntimeShape& input_condition_shape,
4336                       const D* input_condition_data, T* output_data) {
4337   const size_t size = input_condition_shape.FlatSize();
4338   const size_t cond_rank = input_condition_shape.DimensionsCount();
4339 
4340   std::vector<int> dims_to_count(cond_rank, 0);
4341   int cur_flat_size = size;
4342   for (int i = 0; i < cond_rank; ++i) {
4343     dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
4344     cur_flat_size = dims_to_count[i];
4345   }
4346 
4347   int output_index = 0;
4348   for (int i = 0; i < size; ++i) {
4349     if (input_condition_data[i]) {
4350       // Insert the coordinate of the current item (row major) into output.
4351       int flat_index = i;
4352       for (int j = 0; j < cond_rank; ++j) {
4353         int coord_j = flat_index / dims_to_count[j];
4354         output_data[output_index * cond_rank + j] = coord_j;
4355         flat_index %= dims_to_count[j];
4356       }
4357       output_index++;
4358     }
4359   }
4360 }
4361 
4362 // For easy implementation, the indices is always a vector of size-4 vectors.
4363 template <typename T, typename TI>
SparseToDense(const std::vector<std::vector<TI>> & indices,const T * values,T default_value,bool value_is_scalar,const RuntimeShape & unextended_output_shape,T * output_data)4364 inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
4365                           const T* values, T default_value,
4366                           bool value_is_scalar,
4367                           const RuntimeShape& unextended_output_shape,
4368                           T* output_data) {
4369   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4370   const RuntimeShape output_shape =
4371       RuntimeShape::ExtendedShape(4, unextended_output_shape);
4372   const int value_count = indices.size();
4373 
4374   // First fill the output_data with default value.
4375   const int num_elements = output_shape.FlatSize();
4376   for (int i = 0; i < num_elements; ++i) {
4377     output_data[i] = default_value;
4378   }
4379 
4380   // Special handle for value is scalar case to avoid checking the boolean
4381   // condition within the loop every time.
4382   if (value_is_scalar) {
4383     for (int i = 0; i < value_count; ++i) {
4384       const std::vector<TI>& index = indices[i];
4385       TFLITE_DCHECK_EQ(index.size(), 4);
4386       const T value = *values;  // just use the first value.
4387       output_data[Offset(output_shape, index[0], index[1], index[2],
4388                          index[3])] = value;
4389     }
4390     return;
4391   }
4392 
4393   // Go through the values and indices to fill the sparse values.
4394   for (int i = 0; i < value_count; ++i) {
4395     const std::vector<TI>& index = indices[i];
4396     TFLITE_DCHECK_EQ(index.size(), 4);
4397     const T value = values[i];
4398     output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] =
4399         value;
4400   }
4401 }
4402 
4403 template <typename T>
Pow(const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)4404 inline void Pow(const RuntimeShape& input1_shape, const T* input1_data,
4405                 const RuntimeShape& input2_shape, const T* input2_data,
4406                 const RuntimeShape& output_shape, T* output_data) {
4407   const int flat_size =
4408       MatchingFlatSize(input1_shape, input2_shape, output_shape);
4409   for (int i = 0; i < flat_size; ++i) {
4410     output_data[i] = std::pow(input1_data[i], input2_data[i]);
4411   }
4412 }
4413 
4414 template <typename T>
BroadcastPow4DSlow(const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)4415 inline void BroadcastPow4DSlow(const RuntimeShape& unextended_input1_shape,
4416                                const T* input1_data,
4417                                const RuntimeShape& unextended_input2_shape,
4418                                const T* input2_data,
4419                                const RuntimeShape& unextended_output_shape,
4420                                T* output_data) {
4421   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4422   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4423   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4424   const RuntimeShape output_shape =
4425       RuntimeShape::ExtendedShape(4, unextended_output_shape);
4426 
4427   NdArrayDesc<4> desc1;
4428   NdArrayDesc<4> desc2;
4429   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4430                                       unextended_input2_shape, &desc1, &desc2);
4431 
4432   for (int b = 0; b < output_shape.Dims(0); ++b) {
4433     for (int y = 0; y < output_shape.Dims(1); ++y) {
4434       for (int x = 0; x < output_shape.Dims(2); ++x) {
4435         for (int c = 0; c < output_shape.Dims(3); ++c) {
4436           auto out_idx = Offset(output_shape, b, y, x, c);
4437           auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
4438           auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
4439           auto in1_val = input1_data[in1_idx];
4440           auto in2_val = input2_data[in2_idx];
4441           output_data[out_idx] = std::pow(in1_val, in2_val);
4442         }
4443       }
4444     }
4445   }
4446 }
4447 
Logical(const RuntimeShape & input1_shape,const bool * input1_data,const RuntimeShape & input2_shape,const bool * input2_data,const RuntimeShape & output_shape,bool * output_data,const std::function<bool (bool,bool)> & func)4448 inline void Logical(const RuntimeShape& input1_shape, const bool* input1_data,
4449                     const RuntimeShape& input2_shape, const bool* input2_data,
4450                     const RuntimeShape& output_shape, bool* output_data,
4451                     const std::function<bool(bool, bool)>& func) {
4452   const int flat_size =
4453       MatchingFlatSize(input1_shape, input2_shape, output_shape);
4454   for (int i = 0; i < flat_size; ++i) {
4455     output_data[i] = func(input1_data[i], input2_data[i]);
4456   }
4457 }
4458 
BroadcastLogical4DSlow(const RuntimeShape & unextended_input1_shape,const bool * input1_data,const RuntimeShape & unextended_input2_shape,const bool * input2_data,const RuntimeShape & unextended_output_shape,bool * output_data,const std::function<bool (bool,bool)> & func)4459 inline void BroadcastLogical4DSlow(
4460     const RuntimeShape& unextended_input1_shape, const bool* input1_data,
4461     const RuntimeShape& unextended_input2_shape, const bool* input2_data,
4462     const RuntimeShape& unextended_output_shape, bool* output_data,
4463     const std::function<bool(bool, bool)>& func) {
4464   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4465   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4466   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4467   const RuntimeShape output_shape =
4468       RuntimeShape::ExtendedShape(4, unextended_output_shape);
4469 
4470   NdArrayDesc<4> desc1;
4471   NdArrayDesc<4> desc2;
4472   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4473                                       unextended_input2_shape, &desc1, &desc2);
4474 
4475   for (int b = 0; b < output_shape.Dims(0); ++b) {
4476     for (int y = 0; y < output_shape.Dims(1); ++y) {
4477       for (int x = 0; x < output_shape.Dims(2); ++x) {
4478         for (int c = 0; c < output_shape.Dims(3); ++c) {
4479           auto out_idx = Offset(output_shape, b, y, x, c);
4480           auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
4481           auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
4482           auto in1_val = input1_data[in1_idx];
4483           auto in2_val = input2_data[in2_idx];
4484           output_data[out_idx] = func(in1_val, in2_val);
4485         }
4486       }
4487     }
4488   }
4489 }
4490 
4491 // TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
4492 // generalized and efficient BroadcastBinaryFunction.
4493 //
4494 // Also appears to duplicte MinimumMaximum.
4495 //
4496 // R: Result type. T1: Input 1 type. T2: Input 2 type.
4497 template <typename R, typename T1, typename T2>
BroadcastBinaryFunction4DSlow(const RuntimeShape & unextended_input1_shape,const T1 * input1_data,const RuntimeShape & unextended_input2_shape,const T2 * input2_data,const RuntimeShape & unextended_output_shape,R * output_data,R (* func)(T1,T2))4498 inline void BroadcastBinaryFunction4DSlow(
4499     const RuntimeShape& unextended_input1_shape, const T1* input1_data,
4500     const RuntimeShape& unextended_input2_shape, const T2* input2_data,
4501     const RuntimeShape& unextended_output_shape, R* output_data,
4502     R (*func)(T1, T2)) {
4503   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4504   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4505   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4506   const RuntimeShape output_shape =
4507       RuntimeShape::ExtendedShape(4, unextended_output_shape);
4508 
4509   NdArrayDesc<4> desc1;
4510   NdArrayDesc<4> desc2;
4511   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4512                                       unextended_input2_shape, &desc1, &desc2);
4513 
4514   for (int b = 0; b < output_shape.Dims(0); ++b) {
4515     for (int y = 0; y < output_shape.Dims(1); ++y) {
4516       for (int x = 0; x < output_shape.Dims(2); ++x) {
4517         for (int c = 0; c < output_shape.Dims(3); ++c) {
4518           auto out_idx = Offset(output_shape, b, y, x, c);
4519           auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
4520           auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
4521           auto in1_val = input1_data[in1_idx];
4522           auto in2_val = input2_data[in2_idx];
4523           output_data[out_idx] = func(in1_val, in2_val);
4524         }
4525       }
4526     }
4527   }
4528 }
4529 
4530 // R: Result type. T1: Input 1 type. T2: Input 2 type.
4531 // TODO(renjieliu): Refactor other binary functions to use this one.
4532 template <typename R, typename T1, typename T2>
BinaryFunction(const RuntimeShape & input1_shape,const T1 * input1_data,const RuntimeShape & input2_shape,const T2 * input2_data,const RuntimeShape & output_shape,R * output_data,R (* func)(T1,T2))4533 inline void BinaryFunction(const RuntimeShape& input1_shape,
4534                            const T1* input1_data,
4535                            const RuntimeShape& input2_shape,
4536                            const T2* input2_data,
4537                            const RuntimeShape& output_shape, R* output_data,
4538                            R (*func)(T1, T2)) {
4539   const int flat_size =
4540       MatchingFlatSize(input1_shape, input2_shape, output_shape);
4541   for (int i = 0; i < flat_size; ++i) {
4542     output_data[i] = func(input1_data[i], input2_data[i]);
4543   }
4544 }
4545 
4546 template <typename T>
ResizeNearestNeighbor(const tflite::ResizeNearestNeighborParams & op_params,const RuntimeShape & unextended_input_shape,const T * input_data,const RuntimeShape & output_size_shape,const int32 * output_size_data,const RuntimeShape & unextended_output_shape,T * output_data)4547 inline void ResizeNearestNeighbor(
4548     const tflite::ResizeNearestNeighborParams& op_params,
4549     const RuntimeShape& unextended_input_shape, const T* input_data,
4550     const RuntimeShape& output_size_shape, const int32* output_size_data,
4551     const RuntimeShape& unextended_output_shape, T* output_data) {
4552   // Align corners = true is not supported.
4553   TFLITE_DCHECK(!op_params.align_corners);
4554   TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
4555   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4556 
4557   const RuntimeShape input_shape =
4558       RuntimeShape::ExtendedShape(4, unextended_input_shape);
4559   const RuntimeShape output_shape =
4560       RuntimeShape::ExtendedShape(4, unextended_output_shape);
4561 
4562   int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
4563   int32 input_height = input_shape.Dims(1);
4564   int32 input_width = input_shape.Dims(2);
4565   int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
4566 
4567   // The Tensorflow version of this op allows resize on the width and height
4568   // axis only.
4569   TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
4570   int32 output_height = output_size_data[0];
4571   int32 output_width = output_size_data[1];
4572 
4573   // We use float to ensure agreement with the Tensorflow implementation.
4574   const float height_scale = static_cast<float>(input_height) / output_height;
4575   const float width_scale = static_cast<float>(input_width) / output_width;
4576 
4577   const int col_offset = input_shape.Dims(3);
4578   const int row_offset = input_shape.Dims(2) * col_offset;
4579   const int batch_offset = input_shape.Dims(1) * row_offset;
4580 
4581   const T* input_ptr = input_data;
4582   T* output_ptr = output_data;
4583   for (int b = 0; b < batches; ++b) {
4584     for (int y = 0; y < output_height; ++y) {
4585       int32 in_y = std::min(static_cast<int32>(std::floor(y * height_scale)),
4586                             input_height - 1);
4587       const T* y_input_ptr = input_ptr + in_y * row_offset;
4588       for (int x = 0; x < output_width; ++x) {
4589         int32 in_x = std::min(static_cast<int32>(std::floor(x * width_scale)),
4590                               input_width - 1);
4591         const T* x_input_ptr = y_input_ptr + in_x * col_offset;
4592         memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
4593         output_ptr += depth;
4594       }
4595     }
4596     input_ptr += batch_offset;
4597   }
4598 }
4599 
BroadcastPrelu4DSlow(const PreluParams & params,const RuntimeShape & input_shape,const uint8 * input_data,const RuntimeShape & alpha_shape,const uint8 * alpha_data,const RuntimeShape & output_shape,uint8 * output_data)4600 inline void BroadcastPrelu4DSlow(const PreluParams& params,
4601                                  const RuntimeShape& input_shape,
4602                                  const uint8* input_data,
4603                                  const RuntimeShape& alpha_shape,
4604                                  const uint8* alpha_data,
4605                                  const RuntimeShape& output_shape,
4606                                  uint8* output_data) {
4607   TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
4608   TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
4609   TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
4610   const RuntimeShape extended_output_shape =
4611       RuntimeShape::ExtendedShape(4, output_shape);
4612   NdArrayDesc<4> desc1;
4613   NdArrayDesc<4> desc2;
4614   NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
4615 
4616   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
4617     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
4618       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
4619         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
4620           int output_index = Offset(extended_output_shape, b, y, x, c);
4621           int input_index = SubscriptToIndex(desc1, b, y, x, c);
4622           const int32 input_value =
4623               params.input_offset + input_data[input_index];
4624           if (input_value >= 0) {
4625             output_data[output_index] = input_data[input_index];
4626           } else {
4627             auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
4628             const int32 alpha_value =
4629                 params.alpha_offset + alpha_data[alpha_index];
4630             const int32 unclamped_output =
4631                 params.output_offset +
4632                 MultiplyByQuantizedMultiplierSmallerThanOneExp(
4633                     input_value * alpha_value, params.output_multiplier,
4634                     params.output_shift);
4635             const int32 quantized_min = std::numeric_limits<uint8_t>::min();
4636             const int32 quantized_max = std::numeric_limits<uint8_t>::max();
4637             const int32 clamped_output = std::min(
4638                 quantized_max, std::max(quantized_min, unclamped_output));
4639             output_data[output_index] = static_cast<uint8>(clamped_output);
4640           }
4641         }
4642       }
4643     }
4644   }
4645 }
4646 
4647 template <typename T>
Fill(const RuntimeShape & value_shape,const T * value_data,const RuntimeShape & output_shape,T * output_data)4648 void Fill(const RuntimeShape& value_shape, const T* value_data,
4649           const RuntimeShape& output_shape, T* output_data) {
4650   TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
4651   const int flat_size = output_shape.FlatSize();
4652   for (int i = 0; i < flat_size; ++i) {
4653     output_data[i] = *value_data;
4654   }
4655 }
4656 
4657 template <typename Scalar>
Reverse(int axis,const RuntimeShape & input_shape,const Scalar * input_data,const RuntimeShape & output_shape,Scalar * output_data)4658 void Reverse(int axis, const RuntimeShape& input_shape,
4659              const Scalar* input_data, const RuntimeShape& output_shape,
4660              Scalar* output_data) {
4661   gemmlowp::ScopedProfilingLabel label("Reverse");
4662 
4663   int outer_size = 1;
4664   for (int i = 0; i < axis; ++i) {
4665     outer_size *= input_shape.Dims(i);
4666   }
4667 
4668   int copy_size = 1;
4669   for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
4670     copy_size *= input_shape.Dims(i);
4671   }
4672 
4673   const int dims_at_axis = input_shape.Dims(axis);
4674   for (int i = 0; i < outer_size; ++i) {
4675     for (int j = 0; j < dims_at_axis; ++j) {
4676       const int start_pos = (i * dims_at_axis + j) * copy_size;
4677       Scalar* output_ptr = output_data + start_pos;
4678       int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
4679       memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
4680     }
4681   }
4682 }
4683 
4684 template <typename Scalar, typename TS>
ReverseSequence(const TS * seq_lengths,const int seq_dim,const int batch_dim,const RuntimeShape & input_shape,const Scalar * input_data,const RuntimeShape & output_shape,Scalar * output_data)4685 void ReverseSequence(const TS* seq_lengths, const int seq_dim,
4686                      const int batch_dim, const RuntimeShape& input_shape,
4687                      const Scalar* input_data, const RuntimeShape& output_shape,
4688                      Scalar* output_data) {
4689   gemmlowp::ScopedProfilingLabel label("ReverseSequence");
4690 
4691   int outer_size = 1;
4692   int outer_dim = std::min(batch_dim, seq_dim);
4693   int medium_dim = std::max(batch_dim, seq_dim);
4694   for (int i = 0; i < outer_dim; ++i) {
4695     outer_size *= input_shape.Dims(i);
4696   }
4697 
4698   int medium_size = 1;
4699   for (int i = outer_dim + 1; i < medium_dim; ++i) {
4700     medium_size *= input_shape.Dims(i);
4701   }
4702 
4703   int copy_size = 1;
4704   for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i) {
4705     copy_size *= input_shape.Dims(i);
4706   }
4707 
4708   const int dims_at_outer_dim = input_shape.Dims(outer_dim);
4709   const int dims_at_medium_dim = input_shape.Dims(medium_dim);
4710 
4711   Scalar* output_ptr;
4712   if (batch_dim > seq_dim) {
4713     for (int i = 0; i < outer_size; ++i) {
4714       for (int j = 0; j < dims_at_outer_dim; ++j) {
4715         const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
4716         for (int p = 0; p < medium_size; ++p) {
4717           for (int q = 0; q < dims_at_medium_dim; ++q) {
4718             const int in_pos =
4719                 ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
4720             const Scalar* in_ptr = input_data + in_pos;
4721             int sl = seq_lengths[q] - 1;
4722             if (j > sl) {
4723               output_ptr = output_data + in_pos;
4724             } else {
4725               const int out_pos_base =
4726                   (i * dims_at_outer_dim + sl - j) * medium_size;
4727               const int out_pos =
4728                   ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
4729               output_ptr = output_data + out_pos;
4730             }
4731             memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
4732           }
4733         }
4734       }
4735     }
4736   } else if (batch_dim < seq_dim) {
4737     for (int i = 0; i < outer_size; ++i) {
4738       for (int j = 0; j < dims_at_outer_dim; ++j) {
4739         const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
4740         int sl = seq_lengths[j] - 1;
4741         const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
4742         for (int p = 0; p < medium_size; ++p) {
4743           for (int q = 0; q < dims_at_medium_dim; ++q) {
4744             const int in_pos =
4745                 ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
4746             const Scalar* in_ptr = input_data + in_pos;
4747             if (q > sl) {
4748               output_ptr = output_data + in_pos;
4749             } else {
4750               const int out_pos =
4751                   ((out_pos_base + p) * dims_at_medium_dim + sl - q) *
4752                   copy_size;
4753               output_ptr = output_data + out_pos;
4754             }
4755             memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
4756           }
4757         }
4758       }
4759     }
4760   }
4761 }
4762 
4763 }  // namespace reference_ops
4764 }  // namespace tflite
4765 
4766 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
4767