// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_

#include <algorithm>
#include <cmath>
#include <tuple>
#include <type_traits>

#include "../public/output_stages.h"
#include "fixedpoint.h"

namespace gemmlowp {

// A Fragment is a small fixed-size matrix typically stored in one or
// a few architecture-specific SIMD vectors. Besides plain old scalar types
// such as int32_t, Fragment types are the data types that output pipeline
// stages take as inputs and produce as outputs.
//
// More details:
//
// The generic scalar code in this file only implements evaluation of
// output stages for scalar inputs (e.g. plain int32_t values). Other files
// (e.g. output_neon.h) provide SIMD paths by implementing evaluation of
// output stages for SIMD vector types. However, this raises the question
// of how the different values ("lanes") in a SIMD vector correspond to
// different entries in the whole matrix. For simple entry-wise output
// stages this doesn't matter, but for output stages that depend on the
// position within the whole matrix, it does. To solve this problem, rather
// than implementing evaluation of output stages for raw SIMD vector types,
// we wrap SIMD vector types in "fragment" structs that add the missing
// structure of "shape", i.e. the mapping from SIMD lanes to matrix entries,
// and we specialize evaluation of output stages for such fragment types.
// The Fragment template struct here is how we generate all fragment
// structs. For example, in output_neon.h it may be instantiated with
// DataType=int32x4_t, Rows=4, Cols=1. MapOrder doesn't matter for vector
// shapes. While Fragment is only used for SIMD paths, we leave it here in
// this platform-generic file because this same template should cover the
// needs of any SIMD architecture.
template <typename tDataType, int tRows, int tCols, MapOrder tOrder>
struct Fragment {
  typedef tDataType DataType;
  static const int kRows = tRows;
  static const int kCols = tCols;
  static const MapOrder kOrder = tOrder;

  Fragment() {}
  Fragment(const DataType& d) : data(d) {}
  operator DataType() const { return data; }

  DataType data;
};

typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1;
typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1;
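
// For illustration only, a sketch of how an architecture-specific header
// wraps a SIMD vector type in a Fragment (the alias name here is
// hypothetical; the real SIMD back-ends define their own):
//
//   typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor>
//       SIMDFragmentInt32x4x1;
//
// With such an alias, OutputStageEvalImpl specializations can be written
// that evaluate an output stage on 4 matrix entries at a time.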

// OutputStageEvalImpl is the template that we specialize to provide
// implementations of each output stage for each type of input data.
//
// Each specialization provides an OutputType typedef and an Eval function
// returning OutputType. The OutputType typically depends on the InputType.
//
// There are two dimensions in which input data types can vary:
//   1. Different output stages may expect different data types. The
//      only hard constraint is that the first stage accepts int32, as
//      the unpack stage produces int32 accumulators.
//   2. For a given scalar data type such as int32, there is still the
//      possibility of having SIMD vector types such as NEON int32x4_t,
//      typically wrapped as "fragment" types, see struct Fragment.
//      Thus, there can be several OutputStageEvalImpl
//      specializations for a single OutputStageType, for different
//      InputType's.
template <typename OutputStageType, typename InputType>
struct OutputStageEvalImpl {
  // This generic template body should never be hit.
  static_assert(
      std::is_same<InputType, void>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");

  OutputStageEvalImpl(const OutputStageType&) {}
};
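
// For illustration only, the general shape such a specialization is
// expected to have (SomeStage and SomeFragment are hypothetical names; the
// concrete specializations below and in the output_$arch.h headers follow
// this pattern):
//
//   template <>
//   struct OutputStageEvalImpl<SomeStage, SomeFragment> {
//     typedef SomeFragment InputType;
//     typedef SomeFragment OutputType;  // may differ from InputType
//     OutputStageEvalImpl(const SomeStage& s) : output_stage(s) {}
//     OutputType Eval(InputType input, int row, int col) const;
//     const SomeStage& output_stage;
//   };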

// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};
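
// For illustration only, a worked example of the rounding arithmetic above
// with hypothetical stage parameters (not taken from any real model):
// result_offset = 127, result_mult_int = 2, result_shift = 8, and an input
// accumulator of 1000:
//
//   kRoundingTerm = 1 << 7 = 128
//   ((1000 + 127) * 2 + 128) >> 8 = (2254 + 128) >> 8 = 2382 >> 8 = 9
//
// i.e. the stage computes roughly (input + offset) * mult / 2^shift, with
// rounding to nearest on the final shift.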

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for scalar
// data, with a column-shaped (one entry per result row) parameter vector
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(row);
    const std::int32_t result_offset = output_stage.result_offset(row);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for scalar
// data, with a row-shaped (one entry per result column) parameter vector
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(col);
    const std::int32_t result_offset = output_stage.result_offset(col);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageSaturatingCastToUint8 for scalar data
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentUint8x1x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    std::int32_t data = input.data;
    return data > 255 ? 255 : data < 0 ? 0 : data;
  }
};

// Implementation of OutputStageBiasAddition for scalar data
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    if (VectorType::kShape == VectorShape::Row) {
      return input + output_stage.bias_vector(col);
    } else {
      return input + output_stage.bias_vector(row);
    }
  }

  const OutputStage& output_stage;
};
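
// For illustration only, a sketch of how callers typically set up a bias
// addition stage (VectorMap is the library's non-owning vector view; the
// buffer name and size here are placeholders):
//
//   typedef VectorMap<const std::int32_t, VectorShape::Col> ColVectorMap;
//   OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
//   bias_addition_stage.bias_vector = ColVectorMap(bias_data, output_rows);
//
// With a column-shaped bias vector, every entry in result row r gets
// bias_data[r] added; with a row-shaped bias vector, every entry in result
// column c gets bias_data[c] added.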

// Implementation of OutputStageClamp for scalar data
template <>
struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t min = output_stage.min;
    const std::int32_t max = output_stage.max;
    return std::min(std::max(input.data, min), max);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageTanh for either scalar or SIMD data
template <typename tInputType>
struct OutputStageTanhEvalImpl {
  typedef tInputType InputType;
  typedef InputType OutputType;
  typedef typename InputType::DataType DataType;
  typedef OutputStageTanh OutputStage;

  OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
    const std::int32_t real_amplitude_as_int32 =
        output_stage.real_amplitude_as_int32;

    input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
    input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
    output_min = real_zero_as_int32 - real_amplitude_as_int32;
    output_max = real_zero_as_int32 + real_amplitude_as_int32;

    double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
    inverse_amplitude_neg_exponent = 0;
    while (inverse_amplitude_normalized_double < 0.5) {
      inverse_amplitude_normalized_double *= 2;
      inverse_amplitude_neg_exponent++;
    }
    inverse_amplitude_normalized =
        ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double);

    double amplitude_normalized_double = real_amplitude_as_int32;
    amplitude_exponent = 0;
    while (amplitude_normalized_double >= 1.0) {
      amplitude_normalized_double *= 0.5;
      amplitude_exponent++;
    }
    amplitude_normalized =
        ToFixedPoint<DataType, 0>(amplitude_normalized_double);
  }
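
  // For illustration only, a worked example with a hypothetical amplitude
  // (not from any real model): real_amplitude_as_int32 = 1 << 20.
  //  - 1.0 / 2^20 doubled 19 times reaches 0.5, so
  //    inverse_amplitude_normalized represents 0.5 and
  //    inverse_amplitude_neg_exponent == 19.
  //  - 2^20 halved 21 times drops below 1.0, so amplitude_normalized
  //    represents 0.5 and amplitude_exponent == 21.
  // Eval below then undoes these normalizations with the corresponding
  // shifts (28 - 19 and 31 - 21 in this example).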

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

    typedef FixedPoint<DataType, 3> F3;
    typedef FixedPoint<DataType, 0> F0;

    // fixed-point affine transformation
    DataType input_centered =
        Sub(input.data, Dup<DataType>(real_zero_as_int32));
    F3 fixedpoint_input =
        F3::FromRaw(input_centered) * inverse_amplitude_normalized;
    // left shift
    fixedpoint_input.raw() =
        ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent);
    // fixed-point tanh and multiplication
    F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
    // right shift
    DataType int32_output =
        Add(Dup<DataType>(real_zero_as_int32),
            ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

    DataType mask_if_below_cutoff_min =
        MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min));
    DataType mask_if_above_cutoff_max =
        MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max));

    return SelectUsingMask(
        mask_if_below_cutoff_min, Dup<DataType>(output_min),
        SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
                        int32_output));
  }

  const OutputStage& output_stage;
  std::int32_t input_cutoff_min, input_cutoff_max;
  std::int32_t output_min, output_max;
  FixedPoint<DataType, 0> inverse_amplitude_normalized;
  int inverse_amplitude_neg_exponent;
  FixedPoint<DataType, 0> amplitude_normalized;
  int amplitude_exponent;
};

template <>
struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1>
    : OutputStageTanhEvalImpl<FragmentInt32x1x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// OutputPipelineOutputType is a helper to determine the output data type of
// a pipeline, for a given input data type. It is a recursive template; see
// the explanation on OutputPipelineEvalImpl below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
                                            FirstStageOutputType>::Type Type;
};

template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
                                true> {
  typedef InputType Type;
};
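
// For illustration only, how this resolves for a concrete pipeline (the
// pipeline below is just an example):
//
//   typedef std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
//                      OutputStageSaturatingCastToUint8>
//       ExamplePipeline;
//
// OutputPipelineOutputType<ExamplePipeline, 0, FragmentInt32x1x1>::Type is
// FragmentUint8x1x1: the first stage maps FragmentInt32x1x1 to
// FragmentInt32x1x1, the second maps that to FragmentUint8x1x1, and the
// recursion then terminates.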

// OutputPipelineEvalImpl is a helper to implement the evaluation of
// the whole pipeline. It is a recursive template to implement compile-time
// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
// is how we implement recursion: each specialization implements only
// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
// helper to implement the termination of the recursion as a partial
// specialization below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
                                            InputType>::Type OutputType;

  OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
      : head_impl(std::get<FirstStage>(output_pipeline)),
        tail_impl(output_pipeline) {}

  OutputType Eval(InputType input, int row, int col) const {
    // Evaluate the first stage.
    FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
    // Recurse into the remaining stages.
    return tail_impl.Eval(first_stage_output, row, col);
  }

  const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
  const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
                               FirstStageOutputType>
      tail_impl;
};

// Specialization on 'StopRecursion' for terminating the recursion.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
  OutputPipelineEvalImpl(const OutputPipelineType&) {}

  InputType Eval(InputType input, int, int) const {
    // Terminating the recursion.
    return input;
  }
};

// StoreFinalOutput takes the final value at the end of the output pipeline
// and stores it into the destination matrix. It can be specialized for
// different data types; the generic implementation here is typically used
// only for plain old scalar (not SIMD) types.
template <typename OutputType, typename DstType>
void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) {
  *dst->data(row, col) = value;
}

template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor {
  OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
      : output_pipeline_eval_impl_(output_pipeline) {}

  // Execute is the entry point into the output pipeline evaluation code. It
  // should be the only thing that unpack code calls. It takes the result of
  // the unpack stage and stores it into the destination matrix.
  template <typename DstType>
  void Execute(InputType input, DstType* dst, int row, int col) {
    // Statically assert that the output pipeline matches the given destination
    // matrix's scalar type.
    typedef typename OutputPipelineOutputType<OutputPipelineType, 0,
                                              FragmentInt32x1x1>::Type::DataType
        ScalarOutputType;
    typedef typename DstType::Scalar ScalarDstType;
    static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
                  "mismatched destination scalar type and output pipeline");

    // Evaluate the output pipeline.
    auto output = output_pipeline_eval_impl_.Eval(input, row, col);
    // Store the result into the destination matrix.
    StoreFinalOutput(output, dst, row, col);
  }

  const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
      output_pipeline_eval_impl_;
};
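
// For illustration only, a sketch of how unpack code drives the executor
// (the pipeline, accumulator, dst and row/col values are placeholders):
//
//   OutputPipelineExecutor<PipelineType, FragmentInt32x1x1>
//       executor(pipeline);
//   executor.Execute(FragmentInt32x1x1(accumulator), &dst, dst_row, dst_col);
//
// where PipelineType is the std::tuple of output stages, and dst is a
// MatrixMap whose Scalar type matches the pipeline's output type (e.g.
// std::uint8_t for a pipeline ending in OutputStageSaturatingCastToUint8).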

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_H_