1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // output.h: processing the 32-bit accumulators output by the unpack
16 // stage, obtaining the final result matrix entries and storing them into
17 // the destination matrix.
18 
19 #ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
20 #define GEMMLOWP_INTERNAL_OUTPUT_H_
21 
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <tuple>
#include <type_traits>

#include "../fixedpoint/fixedpoint.h"
#include "../public/output_stages.h"
#include "simd_wrappers.h"
29 
30 namespace gemmlowp {
31 
// Primary template for evaluating a single output stage on a register
// buffer. It provides no implementation: the static_assert below fires for
// every InputBufferType other than void, so a (stage, buffer type) pair that
// lacks a dedicated specialization is reported at compile time.
template <typename OutputStage, typename InputBufferType>
struct OutputStageEvalBufferImpl {
  static_assert(
      std::is_same<InputBufferType, void>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");
};
41 
42 template <typename OutputStage, typename InputType>
43 struct OutputStageEvalImpl {
44   static constexpr int kRows = InputType::kRows;
45   static constexpr int kCols = InputType::kCols;
46   using InputBufferType = typename InputType::BufferType;
47   using BufferEvalImplType =
48       OutputStageEvalBufferImpl<OutputStage, InputBufferType>;
49   using OutputBufferType = typename BufferEvalImplType::OutputType;
50   using OutputScalarType = typename OutputBufferType::ScalarType;
51   using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>;
52 
OutputStageEvalImplOutputStageEvalImpl53   OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {}
54 
EvalOutputStageEvalImpl55   OutputType Eval(InputType input, int, int) const {
56     OutputType output;
57     output.buf = buffer_eval_impl.Eval(input.buf);
58     return output;
59   }
60 
61   const BufferEvalImplType buffer_eval_impl;
62 };
63 
64 template <int Size>
65 struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale,
66                                  RegisterBuffer<std::int32_t, Size>> {
67   using InputType = RegisterBuffer<std::int32_t, Size>;
68   using OutputType = RegisterBuffer<std::int32_t, Size>;
69 
70   typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;
71 
72   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}
73 
74   OutputType Eval(InputType input) const {
75     const int result_shift = output_stage.result_shift;
76     const std::int32_t result_mult_int = output_stage.result_mult_int;
77     using RegisterType = typename InputType::RegisterType;
78     const RegisterType result_offset =
79         Dup<RegisterType>(output_stage.result_offset);
80     OutputType output;
81     for (int i = 0; i < InputType::kRegisterCount; i++) {
82       output.reg[i] = RoundingDivideByPOT(
83           Mul(Add(input.reg[i], result_offset), result_mult_int), result_shift);
84     }
85     return output;
86   }
87 
88   const OutputStage& output_stage;
89 };
90 
91 template <int Rows, int Cols, VectorShape Shape>
92 struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>,
93                            RegisterBlock<std::int32_t, Rows, Cols>> {
94   typedef RegisterBlock<std::int32_t, Rows, Cols> InputType;
95   typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType;
96   typedef OutputStageQuantizeDownInt32ToUint8ScalePC<Shape> OutputStage;
97 
98   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
99 
100   OutputType Eval(InputType input, int row, int col) const {
101     OutputType output;
102     const int result_shift = output_stage.result_shift;
103     const int pos = Shape == VectorShape::Col ? row : col;
104     const auto result_mult_int =
105         LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos);
106     const auto result_offset =
107         LoadForBroadcasting<InputType>(output_stage.result_offset, pos);
108     const auto dividend = BroadcastMul<InputType>(
109         BroadcastAdd<InputType>(input, result_offset), result_mult_int);
110     for (int i = 0; i < InputType::kRegisterCount; i++) {
111       output.buf.reg[i] =
112           RoundingDivideByPOT(dividend.buf.reg[i], result_shift);
113     }
114     return output;
115   }
116 
117   const OutputStage& output_stage;
118 };
119 
120 template <int Size>
121 struct OutputStageEvalBufferImpl<
122     OutputStageQuantizeDownInt32ByFixedPoint,
123     RegisterBuffer<std::int32_t, Size>> {
124   typedef RegisterBuffer<std::int32_t, Size> InputType;
125   typedef RegisterBuffer<std::int32_t, Size> OutputType;
126 
127   typedef OutputStageQuantizeDownInt32ByFixedPoint OutputStage;
128 
129   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}
130 
131   OutputType Eval(InputType input) const {
132     OutputType output;
133     using RegisterType = typename InputType::RegisterType;
134     const RegisterType result_offset_after_shift =
135         Dup<RegisterType>(output_stage.result_offset_after_shift);
136     for (int i = 0; i < InputType::kRegisterCount; i++) {
137       const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul(
138           input.reg[i], output_stage.result_fixedpoint_multiplier);
139       output.reg[i] =
140           Add(RoundingDivideByPOT(mulhigh_val, output_stage.result_shift),
141               result_offset_after_shift);
142     }
143     return output;
144   }
145 
146   const OutputStage& output_stage;
147 };
148 
149 template <int Size>
150 struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent,
151                                  RegisterBuffer<std::int32_t, Size>> {
152   typedef RegisterBuffer<std::int32_t, Size> InputType;
153   typedef RegisterBuffer<std::int32_t, Size> OutputType;
154 
155   typedef OutputStageScaleInt32ByFixedPointAndExponent OutputStage;
156 
157   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {
158     left_shift = std::max(0, output_stage.result_exponent);
159     right_shift = std::max(0, -output_stage.result_exponent);
160   }
161 
162   OutputType Eval(InputType input) const {
163     OutputType output;
164     using RegisterType = typename InputType::RegisterType;
165     const RegisterType result_offset_after_shift =
166         Dup<RegisterType>(output_stage.result_offset_after_shift);
167     for (int i = 0; i < InputType::kRegisterCount; i++) {
168       const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul(
169           ShiftLeft(input.reg[i], left_shift),
170           output_stage.result_fixedpoint_multiplier);
171       output.reg[i] = Add(RoundingDivideByPOT(mulhigh_val, right_shift),
172                           result_offset_after_shift);
173     }
174     return output;
175   }
176 
177   const OutputStage& output_stage;
178   int left_shift;
179   int right_shift;
180 };
181 
182 // Implementation of OutputStageSaturatingCastToUint8 for scalar data
183 template <int Size>
184 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
185                                  RegisterBuffer<std::int32_t, Size>> {
186   typedef RegisterBuffer<std::int32_t, Size> InputType;
187   typedef RegisterBuffer<std::uint8_t, Size> OutputType;
188   static_assert(InputType::kRegisterLanes == 1,
189                 "This path is only for scalar values");
190 
191   typedef OutputStageSaturatingCastToUint8 OutputStage;
192 
193   OutputStageEvalBufferImpl(const OutputStage&) {}
194 
195   OutputType Eval(InputType input) const {
196     OutputType output;
197     for (int i = 0; i < InputType::kRegisterCount; i++) {
198       std::int32_t data = input.reg[i];
199       output.reg[i] = data > 255 ? 255 : data < 0 ? 0 : data;
200     }
201     return output;
202   }
203 };
204 
205 // Implementation of OutputStageSaturatingCastToInt16 for scalar data
206 template <int Size>
207 struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
208                                  RegisterBuffer<std::int32_t, Size>> {
209   typedef RegisterBuffer<std::int32_t, Size> InputType;
210   typedef RegisterBuffer<std::int16_t, Size> OutputType;
211   static_assert(InputType::kRegisterLanes == 1,
212                 "This path is only for scalar values");
213 
214   typedef OutputStageSaturatingCastToInt16 OutputStage;
215 
216   OutputStageEvalBufferImpl(const OutputStage&) {}
217 
218   OutputType Eval(InputType input) const {
219     OutputType output;
220     for (int i = 0; i < InputType::kRegisterCount; i++) {
221       std::int32_t data = input.reg[i];
222       output.reg[i] = data > 32767 ? 32767 : data < -32768 ? -32768 : data;
223     }
224     return output;
225   }
226 };
227 
228 template <int Rows, int Cols, typename VectorType>
229 struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
230                            RegisterBlock<std::int32_t, Rows, Cols>> {
231   typedef RegisterBlock<std::int32_t, Rows, Cols> InputType;
232   typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType;
233   typedef OutputStageBiasAddition<VectorType> OutputStage;
234 
235   OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
236 
237   OutputType Eval(InputType input, int row, int col) const {
238     const int pos = VectorType::kShape == VectorShape::Row ? col : row;
239     return BroadcastAdd<InputType>(
240         input, LoadForBroadcasting<InputType>(output_stage.bias_vector, pos));
241   }
242 
243   const OutputStage& output_stage;
244 };
245 
246 template <int Size>
247 struct OutputStageEvalBufferImpl<OutputStageClamp,
248                                  RegisterBuffer<std::int32_t, Size>> {
249   typedef RegisterBuffer<std::int32_t, Size> InputType;
250   typedef RegisterBuffer<std::int32_t, Size> OutputType;
251 
252   typedef OutputStageClamp OutputStage;
253 
254   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}
255 
256   OutputType Eval(InputType input) const {
257     using RegisterType = typename InputType::RegisterType;
258     const RegisterType min = Dup<RegisterType>(output_stage.min);
259     const RegisterType max = Dup<RegisterType>(output_stage.max);
260     OutputType output;
261     for (int i = 0; i < InputType::kRegisterCount; i++) {
262       output.reg[i] = Min(Max(input.reg[i], min), max);
263     }
264     return output;
265   }
266 
267   const OutputStage& output_stage;
268 };
269 
270 template <int Size>
271 struct OutputStageEvalBufferImpl<OutputStageTanh,
272                                  RegisterBuffer<std::int32_t, Size>> {
273   typedef RegisterBuffer<std::int32_t, Size> InputType;
274   typedef RegisterBuffer<std::int32_t, Size> OutputType;
275   using RegisterType = typename InputType::RegisterType;
276   typedef RegisterType DataType;
277   typedef OutputStageTanh OutputStage;
278 
279   OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {
280     const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
281     const std::int32_t real_amplitude_as_int32 =
282         output_stage.real_amplitude_as_int32;
283 
284     input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
285     input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
286     output_min = real_zero_as_int32 - real_amplitude_as_int32;
287     output_max = real_zero_as_int32 + real_amplitude_as_int32;
288 
289     double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
290     inverse_amplitude_neg_exponent = 0;
291     while (inverse_amplitude_normalized_double < 0.5) {
292       inverse_amplitude_normalized_double *= 2;
293       inverse_amplitude_neg_exponent++;
294     }
295     inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble(
296         inverse_amplitude_normalized_double);
297 
298     double amplitude_normalized_double = real_amplitude_as_int32;
299     amplitude_exponent = 0;
300     while (amplitude_normalized_double >= 1.0) {
301       amplitude_normalized_double *= 0.5;
302       amplitude_exponent++;
303     }
304     amplitude_normalized =
305         FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double);
306   }
307 
308   OutputType Eval(InputType input) const {
309     const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
310 
311     typedef FixedPoint<DataType, 3> F3;
312     typedef FixedPoint<DataType, 0> F0;
313 
314     OutputType output;
315 
316     for (int i = 0; i < OutputType::kRegisterCount; i++) {
317       // fixed-point affine transformation
318       DataType input_centered =
319           Sub(input.reg[i], Dup<DataType>(real_zero_as_int32));
320       F3 fixedpoint_input =
321           F3::FromRaw(input_centered) * inverse_amplitude_normalized;
322       // left shift
323       fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(),
324                                          28 - inverse_amplitude_neg_exponent);
325       // fixed-point tanh and multiplication
326       F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
327       // right shift
328       DataType int32_output =
329           Add(Dup<DataType>(real_zero_as_int32),
330               ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));
331 
332       DataType mask_if_below_cutoff_min =
333           MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min));
334       DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual(
335           input.reg[i], Dup<DataType>(input_cutoff_max));
336 
337       output.reg[i] = SelectUsingMask(
338           mask_if_below_cutoff_min, Dup<DataType>(output_min),
339           SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
340                           int32_output));
341     }
342     return output;
343   }
344 
345   const OutputStage& output_stage;
346   std::int32_t input_cutoff_min, input_cutoff_max;
347   std::int32_t output_min, output_max;
348   FixedPoint<DataType, 0> inverse_amplitude_normalized;
349   int inverse_amplitude_neg_exponent;
350   FixedPoint<DataType, 0> amplitude_normalized;
351   int amplitude_exponent;
352 };
353 
354 // OutputPipelineOutputType is a helper to determine the output data type of a
355 // pipeline, for a
356 // given input data type. It is a recursive template; see the explanation on
357 // OutputPipelineEvalImpl below.
358 template <typename OutputPipelineType, int FirstStage, typename InputType,
359           bool StopRecursion =
360               FirstStage == std::tuple_size<OutputPipelineType>::value>
361 struct OutputPipelineOutputType {
362   typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
363       FirstStageType;
364   typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
365       FirstStageOutputType;
366   typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
367                                             FirstStageOutputType>::Type Type;
368 };
369 
370 template <typename OutputPipelineType, int FirstStage, typename InputType>
371 struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
372                                 true> {
373   typedef InputType Type;
374 };
375 
376 // OutputPipelineEvalImpl is a helper to implement the evaluation of
377 // the whole pipeline. It is a recursive template to implement compile-time
378 // unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
379 // is how we implement recursion: each specialization implements only
380 // evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
381 // helper to implement the termination of the recursion as a partial
382 // specialization below.
383 template <typename OutputPipelineType, int FirstStage, typename InputType,
384           bool StopRecursion =
385               FirstStage == std::tuple_size<OutputPipelineType>::value>
386 struct OutputPipelineEvalImpl {
387   typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
388       FirstStageType;
389   typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
390       FirstStageOutputType;
391   typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
392                                             InputType>::Type OutputType;
393 
394   OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
395       : head_impl(std::get<FirstStage>(output_pipeline)),
396         tail_impl(output_pipeline) {}
397 
398   OutputType Eval(InputType input, int row, int col) const {
399     // Evaluate the first stage.
400     FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
401     // Recurse into the remaining stages.
402     return tail_impl.Eval(first_stage_output, row, col);
403   }
404 
405   const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
406   const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
407                                FirstStageOutputType>
408       tail_impl;
409 };
410 
411 // Specialization on 'StopRecursion' for terminating the recursion.
412 template <typename OutputPipelineType, int FirstStage, typename InputType>
413 struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
414   OutputPipelineEvalImpl(const OutputPipelineType&) {}
415 
416   InputType Eval(InputType input, int, int) const {
417     // Terminating the recursion.
418     return input;
419   }
420 };
421 
// Primary template for storing a final register block into a destination.
// It provides no Run(): the static_assert below fires for every
// RegisterBlockType other than void, so only the specializations are usable.
template <typename RegisterBlockType, typename DstType>
struct StoreFinalOutputImpl {
  static_assert(std::is_same<RegisterBlockType, void>::value,
                "This generic impl should never be hit");
};
427 
428 template <typename ScalarType, int Rows, int Cols, typename DstType>
429 struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> {
430   using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>;
431   static void Run(const RegisterBlockType& src, DstType* dst, int row,
432                   int col) {
433     for (int r = 0; r < Rows; r++) {
434       for (int c = 0; c < Cols; c++) {
435         *dst->data(row + r, col + c) = src.buf.reg[r + c * Rows];
436       }
437     }
438   }
439 };
440 
441 // StoreFinalOutput takes the final value at the end of the output pipeline and
442 // stores it into the destination matrix. It can be specialized for different
443 // data types; the generic implementation here is typically used only for plain
444 // old scalar (not SIMD) types.
445 template <typename RegisterBlockType, typename DstType>
446 void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) {
447   StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col);
448 }
449 
450 template <typename OutputPipelineType, typename InputType>
451 struct OutputPipelineExecutor {
452   OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
453       : output_pipeline_eval_impl_(output_pipeline) {}
454 
455   // RunOutputPipeline is the entry point into the output pipeline evaluation
456   // code. It should be the only thing that unpack code calls. It takes the
457   // result
458   // of the unpack stage and stores it into the destination matrix.
459   template <typename DstType>
460   void Execute(InputType input, DstType* dst, int src_global_row,
461                int src_global_col, int dst_row, int dst_col) const {
462     // Statically assert that the output pipeline matches the given destination
463     // matrix's scalar type.
464     typedef typename OutputPipelineOutputType<
465         OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType
466 
467         ScalarOutputType;
468     typedef typename DstType::Scalar ScalarDstType;
469     static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
470                   "mismatched destination scalar type and output pipeline");
471 
472     // Evaluate the output pipeline.
473     auto output =
474         output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col);
475     // Store the result into the destination matrix.
476     StoreFinalOutput(output, dst, dst_row, dst_col);
477   }
478 
479   const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
480       output_pipeline_eval_impl_;
481 };
482 
483 }  // namespace gemmlowp
484 
485 #ifdef GEMMLOWP_NEON
486 #include "output_neon.h"
487 #elif defined(GEMMLOWP_SSE4)
488 #include "output_sse.h"
489 #elif defined(GEMMLOWP_MSA)
490 #include "output_msa.h"
491 #endif
492 
493 #endif  // GEMMLOWP_INTERNAL_OUTPUT_H_
494