1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // output_stages.h: public definitions of the output stages that can
16 // be assembled into an output pipeline, to control how internal
17 // 32-bit accumulators are transformed to obtain the final uint8
18 // result matrix entries.
19 
20 #ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
21 #define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
22 
23 #include <tuple>
24 
25 #include "../internal/common.h"
26 
27 namespace gemmlowp {
28 
// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
struct OutputStageQuantizeDownInt32ToUint8Scale {
  // Added to the input before the multiplication.
  std::int32_t result_offset;
  // Integer factor applied to (input + result_offset).
  std::int32_t result_mult_int;
  // Right-shift amount applied last, after adding the rounding term.
  std::int32_t result_shift;
};
45 
46 // This output stage takes int32 values and returns still int32 values,
47 // but "quantized down" to the uint8 scale; in other words, its output
48 // is typically what one would then clamp to [0..255] and cast to uint8
49 // (see OutputStageSaturatingCastToUint8).
50 //
51 // This "quantization down" process depends on 3 parameters,
52 //   result_offset, result_mult_int, result_shift,
53 // and the result is:
54 //   ((input + result_offset) * result_mult_int + rounding) >> result_shift
55 // where
56 //   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
57 //
58 // Difference from OutputStageQuantizeDownInt32ToUint8Scale here is that each
59 // row or column of the output (depending on tShape) has its own result_offset
60 // and result_mult_int numbers.
61 template <VectorShape tShape>
62 struct OutputStageQuantizeDownInt32ToUint8ScalePC {
63   VectorMap<const std::int32_t, tShape> result_offset;
64   VectorMap<const std::int32_t, tShape> result_mult_int;
65   std::int32_t result_shift;
66 };
67 
// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to a different scale; for example, in a pipeline
// that outputs uint8 values in [0..255], the output of this stage would be
// int32 values ready to be clamped to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
// and the result is:
//   ((FixedPointMul(input, result_fixedpoint_multiplier) +
//   rounding) >> result_shift) + result_offset_after_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
// and where FixedPointMul(x, y) is the nearest integer to the following
// mathematical expression, evaluated without overflow or intermediate
// rounding:
//   (x * y) / 2^31
// In practice, it is expected that FixedPointMul will be implemented
// using hardware "rounding doubling int32 multiply high" instructions,
// such as VQRDMULH on ARM. See in fixedpoint.h the generic function,
// SaturatingRoundingDoublingHighMul.
//
// Notice that the other difference from
// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset
// is applied after the multiplier and shift, not before. This ensures
// that no matter what the multiplier and shift are, the result offset
// is effectively integral: offsetting the final result by an integer.
// The motivation for this is to faithfully support quantization schemes
// where the formula linking quantized values to the real mathematical
// values that they represent, is of the form
//
//   real_value = scale * (quantized_value - zero_point)
//
// where scale is a real number (represented in quantized form by
// result_fixedpoint_multiplier and result_shift) and zero_point
// is an integer telling which quantized value corresponds to the
// real value 0, and is represented here by (the opposite of)
// result_offset_after_shift.
// The motivation for such a quantization scheme, designed to
// ensure that 0 is always a representable value, is that in
// many applications, we need to 0-pad arrays and that can only be
// done for quantized arrays if 0 is a representable value in
// quantized form. In particular, convolution-like operations
// are often implemented using 0-padding, or "im2col"-like
// expansions that implicitly rely on 0-padding. If 0 were not
// a representable value, such operations would have to pad
// using a nonzero value, introducing bias in the computation.
struct OutputStageQuantizeDownInt32ByFixedPoint {
  // Second operand of FixedPointMul, i.e. the multiplier scaled by 2^31.
  std::int32_t result_fixedpoint_multiplier;
  // Rounding right-shift applied to the FixedPointMul result.
  std::int32_t result_shift;
  // Integer offset added once the multiply and shift are done.
  std::int32_t result_offset_after_shift;
};
120 
121 // OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint is the old deprecated
122 // name of OutputStageQuantizeDownInt32ByFixedPoint, before we noticed that
123 // there really wasn't anything Uint8-specific about it.
124 using OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint = OutputStageQuantizeDownInt32ByFixedPoint;
125 
// Variant of OutputStageQuantizeDownInt32ByFixedPoint where the 'shift'
// is not necessarily just a right shift, so we can represent multipliers
// greater than 1. This takes a result_exponent parameter; when it's
// <= 0, this is equivalent to OutputStageQuantizeDownInt32ByFixedPoint
// with result_shift = -result_exponent.
// In the general case, this consists in first left-shifting by
// std::max(result_exponent, 0), before doing the same as
// OutputStageQuantizeDownInt32ByFixedPoint with
// result_shift = std::max(-result_exponent, 0).
struct OutputStageScaleInt32ByFixedPointAndExponent {
  // Fixed-point multiplier, as in OutputStageQuantizeDownInt32ByFixedPoint.
  std::int32_t result_fixedpoint_multiplier;
  // Signed power-of-two exponent: positive means a left shift before the
  // multiply, negative means a right shift after it.
  std::int32_t result_exponent;
  // Integer offset added to the final scaled value.
  std::int32_t result_offset_after_shift;
};
140 
141 // Variant of OutputStageQuantizeDownInt32ByFixedPoint where the 'shift'
142 // is not necessarily just a right shift, so we can represent multipliers
143 // greater than 1. This takes an result_exponent parameter; when it's
144 // <= 0, this is equivalent to OutputStageQuantizeDownInt32ByFixedPoint
145 // with result_shift = -result_exponent.
146 // In the general case, this consists in first left-shifting by
147 // std::max(result_exponent, 0), before doing the same as
148 // OutputStageQuantizeDownInt32ByFixedPoint with
149 // result_shift = std::max(-result_exponent, 0).
150 //
151 // Difference from OutputStageScaleInt32ByFixedPointAndExponent here is that
152 // each row or column of the output (depending on tShape) has its own
153 // result_fixedpoint_multiplier and result_exponent numbers.
154 template <VectorShape tShape>
155 struct OutputStageScaleInt32ByFixedPointAndExponentPC {
156   VectorMap<const std::int32_t, tShape> result_fixedpoint_multiplier;
157   VectorMap<const std::int32_t, tShape> result_exponent;
158   std::int32_t result_offset_after_shift;
159 };
160 
// This output stage takes int32 values that are expected to be already
// on the final uint8 scale, but not necessarily in the [0..255] range.
// It clamps them to the [0..255] range and returns them cast to uint8.
// The stage carries no parameters; it is an empty tag type.
struct OutputStageSaturatingCastToUint8 {};
165 
// This output stage takes int32 values that are expected to be already
// on the final int8 scale, but not necessarily in the [-128..127] range.
// It clamps them to the [-128..127] range and returns them cast to int8.
// The stage carries no parameters; it is an empty tag type.
struct OutputStageSaturatingCastToInt8 {};
170 
// This output stage takes int32 values that are expected to be already
// in the [0..255] range and returns them cast to uint8.
// This stage can save time if used instead of the
// OutputStageSaturatingCastToUint8 stage immediately after the
// OutputStageClamp stage.
// The stage carries no parameters; it is an empty tag type.
struct OutputStageTruncatingCastToUint8 {};
177 
// This output stage takes int32 values that are expected to be already
// on the final int16 scale, but not necessarily in the [-32768..32767] range.
// It clamps them to the [-32768..32767] range and returns them cast to int16.
// The stage carries no parameters; it is an empty tag type.
struct OutputStageSaturatingCastToInt16 {};
182 
// This output stage depends on a "bias vector" that should contain int32
// entries, and be either a row-vector of the same number of columns as the
// result matrix, or a column-vector of the same number of rows as the
// result matrix. This output stage takes int32 values and adds to them
// the corresponding entry of the bias vector (broadcasted in the other
// direction to fit the matrix's shape), outputting int32 values.
template <typename VectorType>
struct OutputStageBiasAddition {
  // The bias vector itself, stored by value as a VectorType.
  VectorType bias_vector;
};
193 
// This output stage clamps value between the specified min and max bounds.
// It can be used to implement "rectified linear unit" activation functions
// in neural networks.
struct OutputStageClamp {
  // Lower clamping bound.
  std::int32_t min;
  // Upper clamping bound.
  std::int32_t max;
};
201 
// Parameters of a tanh output stage. The evaluation itself lives outside
// this header, so the notes below are inferred from the field names only.
// NOTE(review): presumably real_zero_as_int32 is the int32 value that
// represents the real value 0, and real_amplitude_as_int32 is the int32
// distance that represents a real amplitude of 1 -- confirm against the
// stage's implementation before relying on this.
struct OutputStageTanh {
  std::int32_t real_zero_as_int32;
  std::int32_t real_amplitude_as_int32;
};
206 
207 // An output pipeline is just a std::tuple of output stages.
208 // This function generates a standard output pipeline consisting of two stages:
209 // OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8.
210 inline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
211                   OutputStageSaturatingCastToUint8>
MakeStandardOutputPipeline(std::int32_t result_offset,std::int32_t result_mult_int,std::int32_t result_shift)212 MakeStandardOutputPipeline(std::int32_t result_offset,
213                            std::int32_t result_mult_int,
214                            std::int32_t result_shift) {
215   OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage;
216   quantize_down_stage.result_offset = result_offset;
217   quantize_down_stage.result_mult_int = result_mult_int;
218   quantize_down_stage.result_shift = result_shift;
219   OutputStageSaturatingCastToUint8 saturating_cast_stage;
220   return std::make_tuple(quantize_down_stage, saturating_cast_stage);
221 }
222 
223 // An output pipeline is just a std::tuple of output stages.
224 // This function generates a standard output pipeline consisting of two stages:
225 // OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8.
226 template <VectorShape tShape>
227 inline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>,
228                   OutputStageSaturatingCastToUint8>
MakeStandardOutputPipeline(const VectorMap<const std::int32_t,tShape> & result_offset,const VectorMap<const std::int32_t,tShape> & result_mult_int,std::int32_t result_shift)229 MakeStandardOutputPipeline(
230     const VectorMap<const std::int32_t, tShape>& result_offset,
231     const VectorMap<const std::int32_t, tShape>& result_mult_int,
232     std::int32_t result_shift) {
233   OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage;
234   quantize_down_stage.result_offset = result_offset;
235   quantize_down_stage.result_mult_int = result_mult_int;
236   quantize_down_stage.result_shift = result_shift;
237   OutputStageSaturatingCastToUint8 saturating_cast_stage;
238   return std::make_tuple(quantize_down_stage, saturating_cast_stage);
239 }
240 
241 }  // namespace gemmlowp
242 
243 #endif  // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
244