1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
17 
18 #include <cmath>
19 #include <cstdint>
20 #include <limits>
21 
22 #include "tensorflow/lite/kernels/internal/compatibility.h"
23 #include "tensorflow/lite/kernels/internal/round.h"
24 #include "tensorflow/lite/kernels/internal/types.h"
25 
26 namespace tflite {
27 
28 // Given the min and max values of a float array, return
29 // reasonable quantization parameters to use for this array.
30 template <typename T>
ChooseQuantizationParams(double rmin,double rmax,bool narrow_range)31 QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
32                                             bool narrow_range) {
33   const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
34   const T qmax = std::numeric_limits<T>::max();
35   const double qmin_double = qmin;
36   const double qmax_double = qmax;
37   // 0 should always be a representable value. Let's assume that the initial
38   // min,max range contains 0.
39   TFLITE_CHECK_LE(rmin, 0.);
40   TFLITE_CHECK_GE(rmax, 0.);
41   if (rmin == rmax) {
42     // Special case where the min,max range is a point. Should be {0}.
43     TFLITE_CHECK_EQ(rmin, 0.);
44     TFLITE_CHECK_EQ(rmax, 0.);
45     QuantizationParams quantization_params;
46     quantization_params.zero_point = 0;
47     quantization_params.scale = 0.;
48     return quantization_params;
49   }
50 
51   // General case.
52   //
53   // First determine the scale.
54   const double scale = (rmax - rmin) / (qmax_double - qmin_double);
55 
56   // Zero-point computation.
57   // First the initial floating-point computation. The zero-point can be
58   // determined from solving an affine equation for any known pair
59   // (real value, corresponding quantized value).
60   // We know two such pairs: (rmin, qmin) and (rmax, qmax).
61   // The arithmetic error on the zero point computed from either pair
62   // will be roughly machine_epsilon * (sum of absolute values of terms)
63   // so we want to use the variant that adds the smaller terms.
64   const double zero_point_from_min = qmin_double - rmin / scale;
65   const double zero_point_from_max = qmax_double - rmax / scale;
66   const double zero_point_from_min_error =
67       std::abs(qmin_double) + std::abs(rmin / scale);
68   const double zero_point_from_max_error =
69       std::abs(qmax_double) + std::abs(rmax / scale);
70 
71   const double zero_point_double =
72       zero_point_from_min_error < zero_point_from_max_error
73           ? zero_point_from_min
74           : zero_point_from_max;
75 
76   // Now we need to nudge the zero point to be an integer
77   // (our zero points are integer, and this is motivated by the requirement
78   // to be able to represent the real value "0" exactly as a quantized value,
79   // which is required in multiple places, for example in Im2col with SAME
80   // padding).
81   T nudged_zero_point = 0;
82   if (zero_point_double < qmin_double) {
83     nudged_zero_point = qmin;
84   } else if (zero_point_double > qmax_double) {
85     nudged_zero_point = qmax;
86   } else {
87     nudged_zero_point = static_cast<T>(round(zero_point_double));
88   }
89   // The zero point should always be in the range of quantized value,
90   // [qmin, qmax].
91   TFLITE_CHECK_GE(nudged_zero_point, qmin);
92   TFLITE_CHECK_LE(nudged_zero_point, qmax);
93 
94   // Finally, store the result nudged quantization params.
95   QuantizationParams quantization_params;
96   quantization_params.zero_point = nudged_zero_point;
97   quantization_params.scale = scale;
98   return quantization_params;
99 }
100 
101 template <typename T>
ChooseQuantizationParams(double rmin,double rmax)102 QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
103   return ChooseQuantizationParams<T>(rmin, rmax, false);
104 }
105 
106 // Converts a floating-point number to an integer. For all inputs x where
107 // static_cast<IntOut>(x) is legal according to the C++ standard, the result
108 // is identical to that cast (i.e. the result is x with its fractional part
109 // truncated whenever that is representable as IntOut).
110 //
111 // static_cast would cause undefined behavior for the following cases, which
112 // have well-defined behavior for this function:
113 //
114 //  1. If x is NaN, the result is zero.
115 //
116 //  2. If the truncated form of x is above the representable range of IntOut,
117 //     the result is std::numeric_limits<IntOut>::max().
118 //
119 //  3. If the truncated form of x is below the representable range of IntOut,
120 //     the result is std::numeric_limits<IntOut>::min().
121 //
122 // Note that cases #2 and #3 cover infinities as well as finite numbers.
123 //
124 // The range of FloatIn must include the range of IntOut, otherwise
125 // the results are undefined.
126 // TODO(sfeuz): Replace by absl::SafeCast once available.
127 template <class IntOut, class FloatIn>
SafeCast(FloatIn x)128 IntOut SafeCast(FloatIn x) {
129   static_assert(!std::numeric_limits<FloatIn>::is_integer,
130                 "FloatIn is integer");
131   static_assert(std::numeric_limits<IntOut>::is_integer,
132                 "IntOut is not integer");
133   static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
134 
135   // Special case NaN, for which the logic below doesn't work.
136   if (std::isnan(x)) {
137     return 0;
138   }
139 
140   // Negative values all clip to zero for unsigned results.
141   if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
142     return 0;
143   }
144 
145   // Handle infinities.
146   if (std::isinf(x)) {
147     return x < 0 ? std::numeric_limits<IntOut>::min()
148                  : std::numeric_limits<IntOut>::max();
149   }
150 
151   // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
152   // unless x is zero in which case exp == 0. Note that this implies that the
153   // magnitude of x is strictly less than 2^exp.
154   int exp = 0;
155   std::frexp(x, &exp);
156 
157   // Let N be the number of non-sign bits in the representation of IntOut. If
158   // the magnitude of x is strictly less than 2^N, the truncated version of x
159   // is representable as IntOut. The only representable integer for which this
160   // is not the case is kMin for signed types (i.e. -2^N), but that is covered
161   // by the fall-through below.
162   if (exp <= std::numeric_limits<IntOut>::digits) {
163     return x;
164   }
165 
166   // Handle numbers with magnitude >= 2^N.
167   return x < 0 ? std::numeric_limits<IntOut>::min()
168                : std::numeric_limits<IntOut>::max();
169 }
170 
171 // Decompose a double multiplier into a Q0.31 int32 representation of its
172 // significand, and shift representation of NEGATIVE its exponent ---
173 // this is intended as a RIGHT-shift.
174 //
175 // Restricted to the case where the multiplier < 1 (and non-negative).
176 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
177                                          int32_t* quantized_multiplier,
178                                          int* left_shift);
179 
180 // Decompose a double multiplier into a Q0.31 int32 representation of its
181 // significand, and shift representation of its exponent.
182 //
183 // Restricted to the case where the multiplier > 1.
184 void QuantizeMultiplierGreaterThanOne(double double_multiplier,
185                                       int32_t* quantized_multiplier,
186                                       int* left_shift);
187 
188 // Decompose a double multiplier into a Q0.31 int32 representation of its
189 // significand, and shift representation of its exponent.
190 //
191 // Handles an arbitrary positive multiplier. The 'shift' output-value is
192 // basically the 'floating-point exponent' of the multiplier:
193 // Negative for a right-shift (when the multiplier is <1), positive for a
194 // left-shift (when the multiplier is >1)
195 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
196                         int* shift);
197 
198 // Splits a double input value into a returned fraction, and a shift value from
199 // the exponent, using only bitwise and integer operations to support
200 // microcontrollers and other environments without floating-point support.
201 //
202 // This is designed to be a replacement for how std::frexp() is used within the
203 // QuantizeMultiplier() function, and so has a different signature than the
204 // standard version, returning a 64-bit integer rather than a double. This
205 // result has a maximum value of 1<<31, with the fraction expressed as a
206 // proportion of that maximum.
207 //
208 // std::frexp() returns NaNs and infinities unmodified, but since we're
209 // returning integers that can't represent those values, instead we return
210 // a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
211 // result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and
212 // std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
213 // result in return values that end up truncating some bits at the end,
214 // reflecting the loss of precision inherent in denormalization.
215 int64_t IntegerFrExp(double input, int* shift);
216 
217 // Converts an integer fraction in the format produced by IntegerFrExp (where
218 // 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
219 // IEEE binary64 double format result. The implementation uses only integer and
220 // bitwise operators, so no floating point hardware support or emulation is
221 // needed. This is here so quantized operations can run non-time-critical
222 // preparation calculations on microcontrollers and other platforms without
223 // float support.
224 double DoubleFromFractionAndShift(int64_t fraction, int shift);
225 
226 // Performs a multiplication of two numbers in double format, using only integer
227 // and bitwise instructions. This is aimed at supporting housekeeping functions
228 // for quantized operations on microcontrollers without floating-point hardware.
229 double IntegerDoubleMultiply(double a, double b);
230 
231 // Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
232 // greater than b. It is implemented using only integer and logical instructions
233 // so that it can be easily run on microcontrollers for quantized operations.
234 int IntegerDoubleCompare(double a, double b);
235 
236 // This first creates a multiplier in a double equivalent of
237 // Q(input_integer_bits).(31-input_integer_bits) representation, with extra
238 // precision in the double's fractional bits.  It then splits the result into
239 // significand and exponent.
240 void PreprocessSoftmaxScaling(double beta, double input_scale,
241                               int input_integer_bits,
242                               int32_t* quantized_multiplier, int* left_shift);
243 // Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
244 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
245                                     int input_integer_bits,
246                                     int32_t* quantized_multiplier,
247                                     int* left_shift,
248                                     int32_t* reverse_scaling_divisor,
249                                     int* reverse_scaling_left_shift);
250 // Calculate the largest input that will result in a within-bounds intermediate
251 // result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
252 // it must not overflow before we reduce the value by multiplication by the
253 // input multiplier.  The negative radius is used as the minimum difference in
254 // Softmax.
255 int CalculateInputRadius(int input_integer_bits, int input_left_shift);
256 
257 // Nudges a min/max quantization range to ensure zero is zero.
258 // Gymnastics with nudged zero point is to ensure that real zero maps to
259 // an integer, which is required for e.g. zero-padding in convolutional layers.
260 // Outputs nudged_min, nudged_max, nudged_scale.
261 void NudgeQuantizationRange(const float min, const float max,
262                             const int quant_min, const int quant_max,
263                             float* nudged_min, float* nudged_max,
264                             float* nudged_scale);
265 
266 // Fake quantizes (quantizes and dequantizes) input_data using the scale,
267 // nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
268 // in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
269 void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
270                        const float nudged_max, const float* input_data,
271                        float* output_data, const float size);
272 
273 // If x is approximately a power of two (with any positive or negative
274 // exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise
275 // returns false.
276 bool CheckedLog2(const float x, int* log2_result);
277 
278 // Decomposes an array of double multipliers into a Q0.31 int32 representation
279 // of its significand, and shift representation of its exponent.
280 //
281 // Handles an arbitrary multiplier. The 'shift' output-value is
282 // basically the 'floating-point exponent' of the multiplier:
283 // Negative for a right-shift (when the multiplier is <1), positive for a
284 // left-shift (when the multiplier is >1)
285 void QuantizeMultiplierArray(const double* effective_scales, size_t size,
286                              int32_t* effective_scale_significand,
287                              int* effective_shift);
288 
289 }  // namespace tflite
290 
291 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
292