1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
17 
#include <algorithm>
#include <cstdint>
#include <limits>

#include "public/gemmlowp.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
22 
23 namespace tflite {
24 namespace reference_integer_ops {
25 
26 // Element-wise add that can often be used for inner loop of broadcast add as
27 // well as the non-broadcast add.
AddElementwise(int size,const ArithmeticParams & params,const int8_t * input1_data,const int8_t * input2_data,int8_t * output_data)28 inline void AddElementwise(int size, const ArithmeticParams& params,
29                            const int8_t* input1_data, const int8_t* input2_data,
30                            int8_t* output_data) {
31   const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
32   TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
33   TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
34   TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
35   TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
36 
37   for (int i = 0; i < size; ++i) {
38     const int32 input1_val = params.input1_offset + input1_data[i];
39     const int32 input2_val = params.input2_offset + input2_data[i];
40     const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
41     const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
42     const int32 scaled_input1_val =
43         MultiplyByQuantizedMultiplierSmallerThanOneExp(
44             shifted_input1_val, params.input1_multiplier, params.input1_shift);
45     const int32 scaled_input2_val =
46         MultiplyByQuantizedMultiplierSmallerThanOneExp(
47             shifted_input2_val, params.input2_multiplier, params.input2_shift);
48     const int32 raw_sum = scaled_input1_val + scaled_input2_val;
49     const int32 raw_output =
50         MultiplyByQuantizedMultiplierSmallerThanOneExp(
51             raw_sum, params.output_multiplier, params.output_shift) +
52         params.output_offset;
53     const int32 clamped_output =
54         std::min(params.quantized_activation_max,
55                  std::max(params.quantized_activation_min, raw_output));
56     output_data[i] = static_cast<int8_t>(clamped_output);
57   }
58 }
59 
Add(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int8_t * input1_data,const RuntimeShape & input2_shape,const int8_t * input2_data,const RuntimeShape & output_shape,int8_t * output_data)60 inline void Add(const ArithmeticParams& params,
61                 const RuntimeShape& input1_shape, const int8_t* input1_data,
62                 const RuntimeShape& input2_shape, const int8_t* input2_data,
63                 const RuntimeShape& output_shape, int8_t* output_data) {
64   TFLITE_DCHECK_LE(params.quantized_activation_min,
65                    params.quantized_activation_max);
66   const int flat_size =
67       MatchingFlatSize(input1_shape, input2_shape, output_shape);
68 
69   const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
70   TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
71   TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
72   TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
73   TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
74   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
75 }
76 
BroadcastAdd4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int8_t * input1_data,const RuntimeShape & input2_shape,const int8_t * input2_data,const RuntimeShape & output_shape,int8_t * output_data)77 inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
78                                const RuntimeShape& input1_shape,
79                                const int8_t* input1_data,
80                                const RuntimeShape& input2_shape,
81                                const int8_t* input2_data,
82                                const RuntimeShape& output_shape,
83                                int8_t* output_data) {
84   gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int8");
85   NdArrayDesc<4> desc1;
86   NdArrayDesc<4> desc2;
87   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
88                                       &desc2);
89   const RuntimeShape extended_output_shape =
90       RuntimeShape::ExtendedShape(4, output_shape);
91 
92   // In Tensorflow, the dimensions are canonically named (batch_number, row,
93   // col, channel), with extents (batches, height, width, depth), with the
94   // trailing dimension changing most rapidly (channels has the smallest stride,
95   // typically 1 element).
96   //
97   // In generated C code, we store arrays with the dimensions reversed. The
98   // first dimension has smallest stride.
99   //
100   // We name our variables by their Tensorflow convention, but generate C code
101   // nesting loops such that the innermost loop has the smallest stride for the
102   // best cache behavior.
103   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
104     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
105       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
106         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
107           const int32_t input1_val =
108               params.input1_offset +
109               input1_data[SubscriptToIndex(desc1, b, y, x, c)];
110           const int32_t input2_val =
111               params.input2_offset +
112               input2_data[SubscriptToIndex(desc2, b, y, x, c)];
113           const int32_t shifted_input1_val =
114               input1_val * (1 << params.left_shift);
115           const int32_t shifted_input2_val =
116               input2_val * (1 << params.left_shift);
117           const int32_t scaled_input1_val =
118               MultiplyByQuantizedMultiplierSmallerThanOneExp(
119                   shifted_input1_val, params.input1_multiplier,
120                   params.input1_shift);
121           const int32_t scaled_input2_val =
122               MultiplyByQuantizedMultiplierSmallerThanOneExp(
123                   shifted_input2_val, params.input2_multiplier,
124                   params.input2_shift);
125           const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
126           const int32_t raw_output =
127               MultiplyByQuantizedMultiplierSmallerThanOneExp(
128                   raw_sum, params.output_multiplier, params.output_shift) +
129               params.output_offset;
130           const int32_t clamped_output =
131               std::min(params.quantized_activation_max,
132                        std::max(params.quantized_activation_min, raw_output));
133           output_data[Offset(extended_output_shape, b, y, x, c)] =
134               static_cast<int8_t>(clamped_output);
135         }
136       }
137     }
138   }
139 }
140 
141 }  // namespace reference_integer_ops
142 }  // namespace tflite
143 
144 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
145