/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_

#include <algorithm>
#include <cstdint>
#include <limits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
namespace tflite {
namespace reference_ops {

inline int16_t SaturatingLeftShift(int16_t value, int amount) {
  // Compute in 64 bits so that a large shift amount cannot overflow before
  // the clamp below is applied.
  int64_t result = static_cast<int64_t>(value) * (1 << amount);
  result = std::min<int64_t>(result, std::numeric_limits<int16_t>::max());
  result = std::max<int64_t>(result, std::numeric_limits<int16_t>::min());
  return static_cast<int16_t>(result);
}
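// For example: SaturatingLeftShift(20000, 2) returns 32767, since
// 20000 * 4 = 80000 saturates; SaturatingLeftShift(-100, 3) returns -800
// exactly.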

// Similar to ARM instruction SQDMULH.
// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
// rounding to zero instead of to nearest (SQRDMULH).
inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
  bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
  std::int32_t a_32(a);
  std::int32_t b_32(b);
  std::int32_t ab_32 = a_32 * b_32;
  std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
  return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
}
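// For example, in Q15: SaturatingDoublingHighMul(16384, 16384) returns 8192
// (0.5 * 0.5 = 0.25). The truncating division rounds small products toward
// zero, e.g. SaturatingDoublingHighMul(1, 1) returns 0. The only overflow
// case is a == b == -32768, which saturates to +32767.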

template <typename T>
inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
  auto matching_size = MatchingFlatSize(input_shape, output_shape);
  const T* in_end = input_data + matching_size;
  for (; input_data < in_end; input_data++, output_data++) {
    const float in = *input_data;
    *output_data =
        in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
        6;
  }
}
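// This computes hard_swish(x) = x * relu6(x + 3) / 6. For example,
// hard_swish(1.0f) = 1 * 4 / 6 ~= 0.667, inputs <= -3 map to 0, and inputs
// >= +3 pass through unchanged (e.g. hard_swish(4.0f) = 4 * 6 / 6 = 4).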

template <typename T>
inline void HardSwish(const HardSwishParams& params,
                      const RuntimeShape& input_shape, const T* input_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");

  const int flat_size = MatchingFlatSize(input_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    const int16_t input_value = input_data[i] - params.input_zero_point;
    // Left-shift as much as we can without overflow/saturation to put
    // significant bits in the high bits of our 16-bit fixed-point values, so
    // that the fixed-point approximate computations below are as accurate as
    // possible.
    const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
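    // Note: assuming 8-bit quantized input (uint8/int8), input_value lies in
    // [-255, 255], so input_value * 128 is at most 32640 in magnitude and
    // cannot overflow int16_t.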
    // Compute the input value on essentially the output scale, just not
    // right-shifted yet. This is the value that we'll use in the (x >= +3)
    // case, and that in the general case we'll multiply against the "relu-ish"
    // fixed-point multiplier in [0, 1].
    const int16_t input_value_on_preshift_output_scale =
        gemmlowp::SaturatingRoundingDoublingHighMul(
            input_value_on_hires_input_scale,
            params.output_multiplier_fixedpoint_int16);
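    // Note: gemmlowp::SaturatingRoundingDoublingHighMul(a, b) is
    // approximately round(a * b / 2^15), i.e. a Q15 fixed-point product.
    // Hypothetical example: with output_multiplier_fixedpoint_int16 == 24576
    // (0.75 in Q15), a hires input value of 12800 maps to
    // 12800 * 0.75 == 9600 on the pre-shift output scale.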
    // Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case,
    // that is just an affine rescaling of x from [-3, 3] to [0, 1]. In the
    // general case, it is that plus saturation at the boundaries of [-3, 3].
    // First, we rescale from [-3, 3] to [-1, 1], saturating.
    // That is done by rescaling the input value with a fixed-point multiplier
    // (reluish_multiplier_fixedpoint) and bit-shift such that we represent
    // that input value on the scale where the real value 3.0f is represented
    // by the quantized value 32768. (+32768 is actually not representable as
    // int16_t, so this saturates at +32767, and that is seen empirically to
    // be a negligible contribution to numerical error/bias.)
    //
    // This code is careful to correctly implement any magnitude of multiplier,
    // involving either a right shift or a left shift, with correct saturation
    // behavior in the left-shift case. This forces this code to be more
    // complicated, but is necessary for real applications: a partially
    // trained quantized MobileNet v3-small model that motivated this code
    // exhibits some large [min, max] range boundaries, on the order of
    // 10 or 100 depending on the layer.
    //
    // The next few lines are basically just an ordinary
    // MultiplyByQuantizedMultiplier, except that we are more careful here
    // about the fine details of saturation when left-shifting, because here
    // overflow in left-shift is a common case, not an anomaly as
    // MultiplyByQuantizedMultiplier assumes.
    int16_t reluish_value = input_value_on_hires_input_scale;
    // Shift left, saturating, as much as we can while ensuring that this
    // saturation will not contribute to the result. That is, the left shift
    // amount is reduced by 1.
    if (params.reluish_multiplier_exponent > 0) {
      reluish_value = SaturatingLeftShift(
          reluish_value, params.reluish_multiplier_exponent - 1);
    }
    // Apply the fixed-point multiplier, dividing the value by a divisor
    // ranging in [1, 2].
    reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
        reluish_value, params.reluish_multiplier_fixedpoint_int16);
    // Apply the last bit of left-shift. Thus, in the left-shifting case, if
    // any saturation affects the result, it is happening here --- any
    // saturation having occurred above is overwritten here, not affecting the
    // result.
    if (params.reluish_multiplier_exponent > 0) {
      reluish_value = SaturatingLeftShift(reluish_value, 1);
    }
    // Shift right, in the right-shifting case.
    if (params.reluish_multiplier_exponent < 0) {
      reluish_value = gemmlowp::RoundingDivideByPOT(
          reluish_value, -params.reluish_multiplier_exponent);
    }
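    // Hypothetical worked example: a real relu-ish multiplier of 0.4 could be
    // encoded as reluish_multiplier_fixedpoint_int16 == 26214 (~0.8 in Q15)
    // with reluish_multiplier_exponent == -1. Then reluish_value == 12800
    // becomes SaturatingRoundingDoublingHighMul(12800, 26214) == 10240,
    // and RoundingDivideByPOT(10240, 1) == 5120 == 12800 * 0.4.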
    // At this point we have rescaled the value into a 16-bit fixed-point
    // reluish_value in [-1, 1].
    // We now convert that to a 16-bit fixed-point value in [0, 1].
    reluish_value = (reluish_value + (1 << 15)) >> 1;
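    // This maps -32768 (-1.0) to 0, 0 (0.0) to 16384 (0.5), and +32767
    // (~+1.0) to 32767, i.e. [0, 1] on a scale where 32768 would be 1.0.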
    // Use of SaturatingDoublingHighMul here is important to cancel the biases
    // from the above SaturatingRoundingDoublingHighMul.
    //
    // On a partially trained MobileNet-v3-small,
    //
    //                                        | bias on    | ImageNet
    //                                        | quantized  | Top-1
    // Operation used here                    | values     | accuracy (50k)
    // ---------------------------------------+------------+-----------
    // SaturatingDoublingHighMul              | -0.0024    | 58.920
    // SaturatingRoundingDoublingHighMul      | -0.0067    | 58.064
    //
    // In activations_test, this is covered by this testcase:
    // QuantizedActivationsOpTest.HardSwishBias
    //
    const int16_t preshift_output_value = SaturatingDoublingHighMul(
        reluish_value, input_value_on_preshift_output_scale);
    // We were so far operating on the pre-shift output scale. Now we finally
    // apply that output shift, arriving at the final output scale.
    int16_t output_value = gemmlowp::RoundingDivideByPOT(
        preshift_output_value, -params.output_multiplier_exponent);
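    // The shift amount is -output_multiplier_exponent, so this expects a
    // non-positive exponent (a right shift). Hypothetical example: with
    // output_multiplier_exponent == -4, this divides by 16 with rounding.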
    output_value += params.output_zero_point;
    output_value =
        std::min<int16_t>(output_value, std::numeric_limits<T>::max());
    output_value =
        std::max<int16_t>(output_value, std::numeric_limits<T>::min());
    output_data[i] = output_value;
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
167