/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_

#include <algorithm>

#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {

namespace reference_ops {

// Element-wise div that can often be used for the inner loop of broadcast Div
// as well as for the non-broadcast Div.
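// The quantized path below expects the caller to have already folded the
// input and output scales into params.output_multiplier / params.output_shift
// (and the zero points into the *_offset fields); this header does not do
// that conversion itself.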
inline void DivElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  TFLITE_DCHECK_GT(params.output_offset, -256);
  TFLITE_DCHECK_LT(params.output_offset, 256);

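  // Per element, division is carried out in 32-bit fixed point: the zero
  // points are first added back to the raw uint8 values, input1 is multiplied
  // by a fixed-point reciprocal of input2, and the quotient is rescaled by
  // params.output_multiplier / params.output_shift, offset by the output zero
  // point and clamped to the activation range.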
  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    TFLITE_DCHECK_NE(input2_val, 0);
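    // GetReciprocal returns a Q0.31 approximation of 1 / |input2_val|, with
    // recip_shift recording the remaining power-of-two factor, i.e.
    // 1 / input2_val ~= input2_inv * 2^(-31 - recip_shift). Negative
    // denominators are handled by negating both the argument and the result.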
    int recip_shift;
    const int32 input2_inv =
        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
                         : -GetReciprocal(-input2_val, 31, &recip_shift);
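    // input1_val is pre-shifted left by its redundant sign bits (headroom)
    // before the saturating high multiply so precision is not lost; the extra
    // 2^headroom factor, like 2^recip_shift, is removed again via total_shift
    // below.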
    const int headroom = CountLeadingSignBits(input1_val);
    const int32 unscaled_quotient = MultiplyByQuantizedMultiplierGreaterThanOne(
        input1_val, input2_inv, headroom);
    const int total_shift = params.output_shift - recip_shift - headroom;
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            unscaled_quotient, params.output_multiplier, total_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  DivElementwise(flat_size, params, input1_data, input2_data, output_data);
}

template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const uint8* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const uint8* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             uint8* output_data) {
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);

  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  TFLITE_DCHECK_GT(params.output_offset, -256);
  TFLITE_DCHECK_LT(params.output_offset, 256);

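  // The lambda below performs the same fixed-point division as DivElementwise
  // above, but indexes its operands and output through the broadcast
  // descriptors.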
  auto div_func = [&](int indexes[N]) {
    const int32 input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32 input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    TFLITE_DCHECK_NE(input2_val, 0);
    int recip_shift;
    const int32 input2_inv =
        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
                         : -GetReciprocal(-input2_val, 31, &recip_shift);
    const int headroom = CountLeadingSignBits(input1_val);
    const int32 unscaled_quotient = MultiplyByQuantizedMultiplierGreaterThanOne(
        input1_val, input2_inv, headroom);
    const int total_shift = params.output_shift - recip_shift - headroom;
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            unscaled_quotient, params.output_multiplier, total_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<uint8>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, div_func);
}

// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <typename T, int N = 5>
void BroadcastDivSlow(const ArithmeticParams& params,
                      const RuntimeShape& unextended_input1_shape,
                      const T* input1_data,
                      const RuntimeShape& unextended_input2_shape,
                      const T* input2_data,
                      const RuntimeShape& unextended_output_shape,
                      T* output_data) {
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);

  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.

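  // Plain elementwise division per broadcast index, clamped to the activation
  // range computed above.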
  auto div_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] /
                input2_data[SubscriptToIndex(desc2, indexes)],
            output_activation_min, output_activation_max);
  };
  NDOpsHelper<N>(output_desc, div_func);
}

template <typename T>
inline void Div(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] / input2_data[i], output_activation_min,
        output_activation_max);
  }
}
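
// A minimal usage sketch of the generic Div path (illustrative only; the
// buffers, shape and activation bounds below are hypothetical and not part of
// any TFLite API):
//
//   float in1[4] = {2.f, 4.f, 6.f, 8.f};
//   float in2[4] = {1.f, 2.f, 3.f, 4.f};
//   float out[4];
//   tflite::ArithmeticParams params;
//   params.float_activation_min = std::numeric_limits<float>::lowest();
//   params.float_activation_max = std::numeric_limits<float>::max();
//   tflite::RuntimeShape shape({4});
//   tflite::reference_ops::Div(params, shape, in1, shape, in2, shape, out);
//   // out is now {2.f, 2.f, 2.f, 2.f}.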

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_