1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
17
18 #include <algorithm>
19
20 #include "tensorflow/lite/kernels/internal/common.h"
21
22 namespace tflite {
23
24 namespace reference_ops {
25
26 // Element-wise div that can often be used for inner loop of broadcast Div as
27 // well as the non-broadcast Div.
DivElementwise(int size,const ArithmeticParams & params,const uint8 * input1_data,const uint8 * input2_data,uint8 * output_data)28 inline void DivElementwise(int size, const ArithmeticParams& params,
29 const uint8* input1_data, const uint8* input2_data,
30 uint8* output_data) {
31 TFLITE_DCHECK_GT(params.input1_offset, -256);
32 TFLITE_DCHECK_LT(params.input1_offset, 256);
33 TFLITE_DCHECK_GT(params.input2_offset, -256);
34 TFLITE_DCHECK_LT(params.input2_offset, 256);
35 TFLITE_DCHECK_GT(params.output_offset, -256);
36 TFLITE_DCHECK_LT(params.output_offset, 256);
37
38 for (int i = 0; i < size; ++i) {
39 const int32 input1_val = params.input1_offset + input1_data[i];
40 const int32 input2_val = params.input2_offset + input2_data[i];
41 TFLITE_DCHECK_NE(input2_val, 0);
42 int recip_shift;
43 const int32 input2_inv =
44 (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
45 : -GetReciprocal(-input2_val, 31, &recip_shift);
46 const int headroom = CountLeadingSignBits(input1_val);
47 const int32 unscaled_quotient = MultiplyByQuantizedMultiplierGreaterThanOne(
48 input1_val, input2_inv, headroom);
49 const int total_shift = params.output_shift - recip_shift - headroom;
50 const int32 unclamped_result =
51 params.output_offset +
52 MultiplyByQuantizedMultiplierSmallerThanOneExp(
53 unscaled_quotient, params.output_multiplier, total_shift);
54 const int32 clamped_output =
55 std::min(params.quantized_activation_max,
56 std::max(params.quantized_activation_min, unclamped_result));
57 output_data[i] = static_cast<uint8>(clamped_output);
58 }
59 }
60
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8 * input1_data,const RuntimeShape & input2_shape,const uint8 * input2_data,const RuntimeShape & output_shape,uint8 * output_data)61 inline void Div(const ArithmeticParams& params,
62 const RuntimeShape& input1_shape, const uint8* input1_data,
63 const RuntimeShape& input2_shape, const uint8* input2_data,
64 const RuntimeShape& output_shape, uint8* output_data) {
65 TFLITE_DCHECK_LE(params.quantized_activation_min,
66 params.quantized_activation_max);
67 const int flat_size =
68 MatchingElementsSize(input1_shape, input2_shape, output_shape);
69
70 DivElementwise(flat_size, params, input1_data, input2_data, output_data);
71 }
72
73 template <int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const uint8 * input1_data,const RuntimeShape & unextended_input2_shape,const uint8 * input2_data,const RuntimeShape & unextended_output_shape,uint8 * output_data)74 inline void BroadcastDivSlow(const ArithmeticParams& params,
75 const RuntimeShape& unextended_input1_shape,
76 const uint8* input1_data,
77 const RuntimeShape& unextended_input2_shape,
78 const uint8* input2_data,
79 const RuntimeShape& unextended_output_shape,
80 uint8* output_data) {
81 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
82 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
83 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
84
85 NdArrayDesc<N> desc1;
86 NdArrayDesc<N> desc2;
87 NdArrayDesc<N> output_desc;
88 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
89 unextended_input2_shape, &desc1, &desc2);
90 CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
91 &output_desc);
92
93 TFLITE_DCHECK_GT(params.input1_offset, -256);
94 TFLITE_DCHECK_LT(params.input1_offset, 256);
95 TFLITE_DCHECK_GT(params.input2_offset, -256);
96 TFLITE_DCHECK_LT(params.input2_offset, 256);
97 TFLITE_DCHECK_GT(params.output_offset, -256);
98 TFLITE_DCHECK_LT(params.output_offset, 256);
99
100 auto div_func = [&](int indexes[N]) {
101 const int32 input1_val =
102 params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
103 const int32 input2_val =
104 params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
105 TFLITE_DCHECK_NE(input2_val, 0);
106 int recip_shift;
107 const int32 input2_inv =
108 (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
109 : -GetReciprocal(-input2_val, 31, &recip_shift);
110 const int headroom = CountLeadingSignBits(input1_val);
111 const int32 unscaled_quotient = MultiplyByQuantizedMultiplierGreaterThanOne(
112 input1_val, input2_inv, headroom);
113 const int total_shift = params.output_shift - recip_shift - headroom;
114 const int32 unclamped_result =
115 params.output_offset +
116 MultiplyByQuantizedMultiplierSmallerThanOneExp(
117 unscaled_quotient, params.output_multiplier, total_shift);
118 const int32 clamped_output =
119 std::min(params.quantized_activation_max,
120 std::max(params.quantized_activation_min, unclamped_result));
121 output_data[SubscriptToIndex(output_desc, indexes)] =
122 static_cast<uint8>(clamped_output);
123 };
124 NDOpsHelper<N>(output_desc, div_func);
125 }
126
127 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
128 // dimensionality if the runtime code does a single loop over one dimension
129 // that handles broadcasting as the base case. The code generator would then
130 // generate max(D1, D2) nested for loops.
131 template <typename T, int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)132 void BroadcastDivSlow(const ArithmeticParams& params,
133 const RuntimeShape& unextended_input1_shape,
134 const T* input1_data,
135 const RuntimeShape& unextended_input2_shape,
136 const T* input2_data,
137 const RuntimeShape& unextended_output_shape,
138 T* output_data) {
139 T output_activation_min;
140 T output_activation_max;
141 GetActivationParams(params, &output_activation_min, &output_activation_max);
142
143 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
144 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
145 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
146
147 NdArrayDesc<N> desc1;
148 NdArrayDesc<N> desc2;
149 NdArrayDesc<N> output_desc;
150 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
151 unextended_input2_shape, &desc1, &desc2);
152 CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
153 &output_desc);
154
155 // In Tensorflow, the dimensions are canonically named (batch_number, row,
156 // col, channel), with extents (batches, height, width, depth), with the
157 // trailing dimension changing most rapidly (channels has the smallest
158 // stride, typically 1 element).
159 //
160 // In generated C code, we store arrays with the dimensions reversed. The
161 // first dimension has smallest stride.
162
163 auto div_func = [&](int indexes[N]) {
164 output_data[SubscriptToIndex(output_desc, indexes)] =
165 ActivationFunctionWithMinMax(
166 input1_data[SubscriptToIndex(desc1, indexes)] /
167 input2_data[SubscriptToIndex(desc2, indexes)],
168 output_activation_min, output_activation_max);
169 };
170 NDOpsHelper<N>(output_desc, div_func);
171 }
172
173 template <typename T>
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)174 inline void Div(const ArithmeticParams& params,
175 const RuntimeShape& input1_shape, const T* input1_data,
176 const RuntimeShape& input2_shape, const T* input2_data,
177 const RuntimeShape& output_shape, T* output_data) {
178 T output_activation_min;
179 T output_activation_max;
180 GetActivationParams(params, &output_activation_min, &output_activation_max);
181
182 const int flat_size =
183 MatchingElementsSize(input1_shape, input2_shape, output_shape);
184 for (int i = 0; i < flat_size; ++i) {
185 output_data[i] = ActivationFunctionWithMinMax(
186 input1_data[i] / input2_data[i], output_activation_min,
187 output_activation_max);
188 }
189 }
190
191 } // namespace reference_ops
192 } // namespace tflite
193
194 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
195