/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_

#include <cmath>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

// Given the min and max values of a float array, return
// reasonable quantization parameters to use for this array.
template <typename T>
QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
                                            bool narrow_range) {
  const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
  const T qmax = std::numeric_limits<T>::max();
  const double qmin_double = qmin;
  const double qmax_double = qmax;
  // 0 should always be a representable value. Let's assume that the initial
  // min,max range contains 0.
  TFLITE_CHECK_LE(rmin, 0.);
  TFLITE_CHECK_GE(rmax, 0.);
  if (rmin == rmax) {
    // Special case where the min,max range is a point. Should be {0}.
    TFLITE_CHECK_EQ(rmin, 0.);
    TFLITE_CHECK_EQ(rmax, 0.);
    QuantizationParams quantization_params;
    quantization_params.zero_point = 0;
    quantization_params.scale = 0.;
    return quantization_params;
  }

  // General case.
  //
  // First determine the scale.
  const double scale = (rmax - rmin) / (qmax_double - qmin_double);

  // Zero-point computation.
  // First the initial floating-point computation. The zero-point can be
  // determined from solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // The arithmetic error on the zero point computed from either pair
  // will be roughly machine_epsilon * (sum of absolute values of terms)
  // so we want to use the variant that adds the smaller terms.
  const double zero_point_from_min = qmin_double - rmin / scale;
  const double zero_point_from_max = qmax_double - rmax / scale;
  const double zero_point_from_min_error =
      std::abs(qmin_double) + std::abs(rmin / scale);
  const double zero_point_from_max_error =
      std::abs(qmax_double) + std::abs(rmax / scale);

  const double zero_point_double =
      zero_point_from_min_error < zero_point_from_max_error
          ? zero_point_from_min
          : zero_point_from_max;

  // Now we need to nudge the zero point to be an integer
  // (our zero points are integer, and this is motivated by the requirement
  // to be able to represent the real value "0" exactly as a quantized value,
  // which is required in multiple places, for example in Im2col with SAME
  // padding).
  T nudged_zero_point = 0;
  if (zero_point_double < qmin_double) {
    nudged_zero_point = qmin;
  } else if (zero_point_double > qmax_double) {
    nudged_zero_point = qmax;
  } else {
    nudged_zero_point = static_cast<T>(TfLiteRound(zero_point_double));
  }
  // The zero point should always be in the range of quantized value,
  // [qmin, qmax].
  TFLITE_CHECK_GE(nudged_zero_point, qmin);
  TFLITE_CHECK_LE(nudged_zero_point, qmax);

  // Finally, store the resulting nudged quantization params.
  QuantizationParams quantization_params;
  quantization_params.zero_point = nudged_zero_point;
  quantization_params.scale = scale;
  return quantization_params;
}

template <typename T>
QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
  return ChooseQuantizationParams<T>(rmin, rmax, false);
}
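
// Illustrative usage (a sketch only, with approximate values): choosing uint8
// parameters for a tensor whose observed float range is [-1.0, 3.0]. With
// qmin = 0 and qmax = 255, scale = (3.0 - (-1.0)) / 255 ~= 0.0157, and the
// zero point nudges to round(0 - (-1.0) / scale) = 64, so the real value 0.0
// maps exactly to the quantized value 64.
//
//   const auto params = ChooseQuantizationParams<uint8_t>(-1.0, 3.0);
//   // Expected: params.scale ~= 0.0157, params.zero_point == 64.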

// Converts a floating-point number to an integer. For all inputs x where
// static_cast<IntOut>(x) is legal according to the C++ standard, the result
// is identical to that cast (i.e. the result is x with its fractional part
// truncated whenever that is representable as IntOut).
//
// static_cast would cause undefined behavior for the following cases, which
// have well-defined behavior for this function:
//
// 1. If x is NaN, the result is zero.
//
// 2. If the truncated form of x is above the representable range of IntOut,
//    the result is std::numeric_limits<IntOut>::max().
//
// 3. If the truncated form of x is below the representable range of IntOut,
//    the result is std::numeric_limits<IntOut>::min().
//
// Note that cases #2 and #3 cover infinities as well as finite numbers.
//
// The range of FloatIn must include the range of IntOut, otherwise
// the results are undefined.
// TODO(sfeuz): Replace by absl::SafeCast once available.
template <class IntOut, class FloatIn>
IntOut SafeCast(FloatIn x) {
  static_assert(!std::numeric_limits<FloatIn>::is_integer,
                "FloatIn is integer");
  static_assert(std::numeric_limits<IntOut>::is_integer,
                "IntOut is not integer");
  static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");

  // Special case NaN, for which the logic below doesn't work.
  if (std::isnan(x)) {
    return 0;
  }

  // Negative values all clip to zero for unsigned results.
  if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
    return 0;
  }

  // Handle infinities.
  if (std::isinf(x)) {
    return x < 0 ? std::numeric_limits<IntOut>::min()
                 : std::numeric_limits<IntOut>::max();
  }

  // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
  // unless x is zero in which case exp == 0. Note that this implies that the
  // magnitude of x is strictly less than 2^exp.
  int exp = 0;
  std::frexp(x, &exp);

  // Let N be the number of non-sign bits in the representation of IntOut. If
  // the magnitude of x is strictly less than 2^N, the truncated version of x
  // is representable as IntOut. The only representable integer for which this
  // is not the case is kMin for signed types (i.e. -2^N), but that is covered
  // by the fall-through below.
  if (exp <= std::numeric_limits<IntOut>::digits) {
    return x;
  }

  // Handle numbers with magnitude >= 2^N.
  return x < 0 ? std::numeric_limits<IntOut>::min()
               : std::numeric_limits<IntOut>::max();
}
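
// Illustrative usage (a sketch only): saturating conversions that would be
// undefined behavior with a plain static_cast.
//
//   int8_t a = SafeCast<int8_t>(300.0f);   // 127 (clamps to the int8 max).
//   int8_t b = SafeCast<int8_t>(-1e9f);    // -128 (clamps to the int8 min).
//   uint8_t c = SafeCast<uint8_t>(-3.7f);  // 0 (negative clips to zero).
//   int8_t d = SafeCast<int8_t>(NAN);      // 0 (NaN maps to zero).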

// Decompose a double multiplier into a Q0.31 int32 representation of its
// significand, and a shift representation of the NEGATIVE of its exponent ---
// this is intended as a RIGHT-shift.
//
// Restricted to the case where the multiplier is < 1 (and non-negative).
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                         int32_t* quantized_multiplier,
                                         int* left_shift);

// Decompose a double multiplier into a Q0.31 int32 representation of its
// significand, and shift representation of its exponent.
//
// Restricted to the case where the multiplier > 1.
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                      int32_t* quantized_multiplier,
                                      int* left_shift);

// Decompose a double multiplier into a Q0.31 int32 representation of its
// significand, and shift representation of its exponent.
//
// Handles an arbitrary positive multiplier. The 'shift' output-value is
// basically the 'floating-point exponent' of the multiplier:
// Negative for a right-shift (when the multiplier is <1), positive for a
// left-shift (when the multiplier is >1)
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                        int* shift);
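
// Illustrative decomposition (expected values sketched from the frexp-style
// convention described above, not a normative specification): 0.375 equals
// 0.75 * 2^-1, so its Q0.31 significand is round(0.75 * 2^31) = 0x60000000
// and its shift is -1 (a right shift); 3.0 = 0.75 * 2^2 pairs the same
// significand with a shift of +2.
//
//   int32_t quantized_multiplier;
//   int shift;
//   QuantizeMultiplier(0.375, &quantized_multiplier, &shift);
//   // Expected: quantized_multiplier == 0x60000000, shift == -1.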

// Splits a double input value into a returned fraction, and a shift value from
// the exponent, using only bitwise and integer operations to support
// microcontrollers and other environments without floating-point support.
//
// This is designed to be a replacement for how std::frexp() is used within the
// QuantizeMultiplier() function, and so has a different signature than the
// standard version, returning a 64-bit integer rather than a double. This
// result has a maximum value of 1<<31, with the fraction expressed as a
// proportion of that maximum.
//
// std::frexp() returns NaNs and infinities unmodified, but since we're
// returning integers that can't represent those values, instead we return
// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
// result of 0 for NaNs, std::numeric_limits<int64_t>::max() for +INFINITY, and
// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
// result in return values that end up truncating some bits at the end,
// reflecting the loss of precision inherent in denormalization.
int64_t IntegerFrExp(double input, int* shift);

// Converts an integer fraction in the format produced by IntegerFrExp (where
// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
// IEEE binary64 double format result. The implementation uses only integer and
// bitwise operators, so no floating point hardware support or emulation is
// needed. This is here so quantized operations can run non-time-critical
// preparation calculations on microcontrollers and other platforms without
// float support.
double DoubleFromFractionAndShift(int64_t fraction, int shift);
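
// Illustrative round trip (expected values sketched from the convention above,
// where the fraction is a proportion of 1<<31): 1.0 equals 0.5 * 2^1, and 0.5
// of the 1<<31 maximum is 0x40000000.
//
//   int shift;
//   int64_t fraction = IntegerFrExp(1.0, &shift);
//   // Expected: fraction == 0x40000000, shift == 1.
//   double back = DoubleFromFractionAndShift(fraction, shift);
//   // Expected: back == 1.0.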

// Performs a multiplication of two numbers in double format, using only
// integer and bitwise instructions. This is aimed at supporting housekeeping
// functions for quantized operations on microcontrollers without
// floating-point hardware.
double IntegerDoubleMultiply(double a, double b);

// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
// greater than b. It is implemented using only integer and logical
// instructions so that it can be easily run on microcontrollers for
// quantized operations.
int IntegerDoubleCompare(double a, double b);
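
// Illustrative usage (a sketch only; results follow the integer emulation of
// double arithmetic described above):
//
//   double product = IntegerDoubleMultiply(0.5, 8.0);  // Expected: 4.0.
//   int order = IntegerDoubleCompare(1.0, 2.0);        // Expected: -1.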

// This first creates a multiplier in a double equivalent of
// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
// precision in the double's fractional bits. It then splits the result into
// significand and exponent.
void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift);
// Like PreprocessSoftmaxScaling, but the inverse scaling factors are also
// calculated.
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
                                    int input_integer_bits,
                                    int32_t* quantized_multiplier,
                                    int* left_shift,
                                    int32_t* reverse_scaling_divisor,
                                    int* reverse_scaling_left_shift);
// Calculate the largest input that will result in a within-bounds intermediate
// result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words,
// it must not overflow before we reduce the value by multiplication by the
// input multiplier. The negative radius is used as the minimum difference in
// Softmax.
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
                         int total_signed_bits = 31);
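
// Illustrative usage (a sketch of how a softmax kernel might prepare its
// quantized parameters; the value 5 for input_integer_bits and the variable
// names are assumptions for this example, not mandated by the API):
//
//   int32_t input_multiplier;
//   int input_left_shift;
//   PreprocessSoftmaxScaling(/*beta=*/1.0, /*input_scale=*/0.05,
//                            /*input_integer_bits=*/5, &input_multiplier,
//                            &input_left_shift);
//   const int diff_min =
//       -CalculateInputRadius(/*input_integer_bits=*/5, input_left_shift);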

// Nudges a min/max quantization range to ensure zero is zero.
// Gymnastics with nudged zero point is to ensure that real zero maps to
// an integer, which is required for e.g. zero-padding in convolutional layers.
// Outputs nudged_min, nudged_max, nudged_scale.
void NudgeQuantizationRange(const float min, const float max,
                            const int quant_min, const int quant_max,
                            float* nudged_min, float* nudged_max,
                            float* nudged_scale);

// Fake quantizes (quantizes and dequantizes) input_data using the scale,
// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the
// code in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
                       const float nudged_max, const float* input_data,
                       float* output_data, const float size);
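
// Illustrative usage (a sketch only; the buffers and the [-1.0f, 1.0f] range
// are assumptions for this example): nudge a float range onto the uint8 grid,
// then fake-quantize a small buffer with the nudged parameters.
//
//   float nudged_min, nudged_max, nudged_scale;
//   NudgeQuantizationRange(-1.0f, 1.0f, 0, 255, &nudged_min, &nudged_max,
//                          &nudged_scale);
//   float input[4] = {-1.5f, -0.25f, 0.0f, 2.0f};
//   float output[4];
//   FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input, output, 4);
//   // output now holds input snapped to the nudged quantization grid.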

// If x is approximately a power of two (with any positive or negative
// exponent), returns true and stores that exponent (i.e. log2(x)) in
// *log2_result; otherwise returns false.
bool CheckedLog2(const float x, int* log2_result);
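
// Illustrative usage (a sketch of expected values):
//
//   int exponent;
//   bool ok = CheckedLog2(0.25f, &exponent);  // Expected: true, exponent == -2.
//   bool no = CheckedLog2(0.3f, &exponent);   // Expected: false.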

// Decomposes an array of double multipliers into a Q0.31 int32 representation
// of their significands, and a shift representation of their exponents.
//
// Handles arbitrary multipliers. Each 'shift' output-value is
// basically the 'floating-point exponent' of the corresponding multiplier:
// negative for a right-shift (when the multiplier is <1), positive for a
// left-shift (when the multiplier is >1).
void QuantizeMultiplierArray(const double* effective_scales, size_t size,
                             int32_t* effective_scale_significand,
                             int* effective_shift);
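
// Illustrative usage (a sketch only; the per-channel scales are made-up
// example values): decompose a set of effective scales, e.g. for per-channel
// quantized weights.
//
//   const double effective_scales[3] = {0.375, 1.5, 0.0625};
//   int32_t significands[3];
//   int shifts[3];
//   QuantizeMultiplierArray(effective_scales, 3, significands, shifts);
//   // Each (significands[i], shifts[i]) pair encodes effective_scales[i] in
//   // the same Q0.31-plus-shift format as QuantizeMultiplier.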

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_