1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // output.h: processing the 32-bit accumulators output by the unpack 16 // stage, obtaining the final result matrix entries and storing them into 17 // the destination matrix. 18 19 #ifndef GEMMLOWP_INTERNAL_OUTPUT_H_ 20 #define GEMMLOWP_INTERNAL_OUTPUT_H_ 21 22 #include <cmath> 23 #include <tuple> 24 #include <type_traits> 25 26 #include "../public/output_stages.h" 27 #include "fixedpoint.h" 28 29 namespace gemmlowp { 30 31 // A Fragment is a small fixed-size matrix typically stored in one or 32 // a few architecture-specific SIMD vectors. Besides plain old scalar types 33 // such as int32_t, Fragment types are what can be used as input/output data 34 // types for output pipeline stages. 35 // 36 // More details: 37 // 38 // In the generic scalar code in this file, we have only implemented 39 // evaluation of output stages for scalar inputs (e.g. plain int32_t values). 40 // Other files (e.g. output_neon.h) are to provide SIMD paths by implementing 41 // evaluation of output stages for SIMD vector types. However, this raises 42 // the question of how the different values ("lanes") in a SIMD vector 43 // correspond to different values in the whole matrices. For simple entry-wise 44 // output stages, this doesn't matter, but for other output stages depending 45 // on position within the whole matrix, this does matter. To solve this 46 // problem, rather than implementing evaluation of output stages for raw 47 // SIMD vector types, we wrap SIMD vector types in "fragment" structs that 48 // bring the additional structure of "shape" i.e. mapping SIMD lanes to 49 // matrix entries, and we specialize evaluation of output stage for such 50 // fragment types. The Fragment template struct here is how we generate 51 // all fragment structs. For example, in output_neon.h, it may be specialized 52 // with DataType=int32x4_t, Rows=4, Cols=1. MapOrder doesn't matter for 53 // vector shapes. While Fragment is only used for SIMD paths, we leave it 54 // here in this platform-generic file because this same template should 55 // cover the needs of any SIMD architectures. 56 template <typename tDataType, int tRows, int tCols, MapOrder tOrder> 57 struct Fragment { 58 typedef tDataType DataType; 59 static const int kRows = tRows; 60 static const int kCols = tCols; 61 static const MapOrder kOrder = tOrder; 62 FragmentFragment63 Fragment() {} FragmentFragment64 Fragment(const DataType& d) : data(d) {} DataTypeFragment65 operator DataType() const { return data; } 66 67 DataType data; 68 }; 69 70 typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1; 71 typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1; 72 73 // OutputStageEvalImpl is the template that we specialize to provide 74 // implementations of each output stage for each type of input data. 75 // 76 // Each specialization provides a OutputType typedef and an Eval function 77 // returning OutputType. The OutputType typically depends on the InputType. 78 // 79 // There are two dimensions in which input data types can vary: 80 // 1. Different output stages may expect different data types. The 81 // only hard constraint is that the first stage accepts int32, as 82 // the unpack stage produces int32 accumulators. 83 // 2. For a given scalar data type such as int32, there is still the 84 // possibility of having SIMD vector types such as NEON int32x4_t, 85 // typically wrapped as "fragment" types, see struct Fragment. 86 // Thus, there can be several OutputStageEvalImpl 87 // specializations for a single OutputStageType, for different 88 // InputType's. 89 template <typename OutputStageType, typename InputType> 90 struct OutputStageEvalImpl { 91 // This generic template body should never be hit. 92 static_assert( 93 std::is_same<InputType, void>::value, 94 "Unimplemented: missing implementation of this output pipeline stage " 95 "for this data type. This would happen if some architecture-specific " 96 "SIMD back-end (output_$arch.h) were incomplete."); 97 OutputStageEvalImplOutputStageEvalImpl98 OutputStageEvalImpl(const OutputStageType&) {} 99 }; 100 101 // Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data 102 template <> 103 struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale, 104 FragmentInt32x1x1> { 105 typedef FragmentInt32x1x1 InputType; 106 typedef FragmentInt32x1x1 OutputType; 107 typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage; 108 109 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 110 111 OutputType Eval(InputType input, int, int) const { 112 const std::int32_t result_shift = output_stage.result_shift; 113 const std::int32_t result_mult_int = output_stage.result_mult_int; 114 const std::int32_t result_offset = output_stage.result_offset; 115 const std::int32_t kRoundingTerm = 116 (result_shift < 1) ? 0 : (1 << (result_shift - 1)); 117 return ((input + result_offset) * result_mult_int + kRoundingTerm) >> 118 result_shift; 119 } 120 121 const OutputStage& output_stage; 122 }; 123 124 template <> 125 struct OutputStageEvalImpl< 126 OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>, 127 FragmentInt32x1x1> { 128 typedef FragmentInt32x1x1 InputType; 129 typedef FragmentInt32x1x1 OutputType; 130 typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col> 131 OutputStage; 132 133 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 134 135 OutputType Eval(InputType input, int row, int col) const { 136 const std::int32_t result_shift = output_stage.result_shift; 137 const std::int32_t result_mult_int = output_stage.result_mult_int(row); 138 const std::int32_t result_offset = output_stage.result_offset(row); 139 const std::int32_t kRoundingTerm = 140 (result_shift < 1) ? 0 : (1 << (result_shift - 1)); 141 return ((input + result_offset) * result_mult_int + kRoundingTerm) >> 142 result_shift; 143 } 144 145 const OutputStage& output_stage; 146 }; 147 148 template <> 149 struct OutputStageEvalImpl< 150 OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>, 151 FragmentInt32x1x1> { 152 typedef FragmentInt32x1x1 InputType; 153 typedef FragmentInt32x1x1 OutputType; 154 typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row> 155 OutputStage; 156 157 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 158 159 OutputType Eval(InputType input, int row, int col) const { 160 const std::int32_t result_shift = output_stage.result_shift; 161 const std::int32_t result_mult_int = output_stage.result_mult_int(col); 162 const std::int32_t result_offset = output_stage.result_offset(col); 163 const std::int32_t kRoundingTerm = 164 (result_shift < 1) ? 0 : (1 << (result_shift - 1)); 165 return ((input + result_offset) * result_mult_int + kRoundingTerm) >> 166 result_shift; 167 } 168 169 const OutputStage& output_stage; 170 }; 171 172 // Implementation of OutputStageSaturatingCastToUint8 for scalar data 173 template <> 174 struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8, 175 FragmentInt32x1x1> { 176 typedef FragmentInt32x1x1 InputType; 177 typedef FragmentUint8x1x1 OutputType; 178 typedef OutputStageSaturatingCastToUint8 OutputStage; 179 180 OutputStageEvalImpl(const OutputStage&) {} 181 182 OutputType Eval(InputType input, int, int) const { 183 std::int32_t data = input.data; 184 return data > 255 ? 255 : data < 0 ? 0 : data; 185 } 186 }; 187 188 // Implementation of OutputStageBiasAddition for scalar data 189 template <typename VectorType> 190 struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, 191 FragmentInt32x1x1> { 192 typedef FragmentInt32x1x1 InputType; 193 typedef FragmentInt32x1x1 OutputType; 194 typedef OutputStageBiasAddition<VectorType> OutputStage; 195 196 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 197 198 OutputType Eval(InputType input, int row, int col) const { 199 if (VectorType::kShape == VectorShape::Row) { 200 return input + output_stage.bias_vector(col); 201 } else { 202 return input + output_stage.bias_vector(row); 203 } 204 } 205 206 const OutputStage& output_stage; 207 }; 208 209 // Implementation of OutputStageClamp for scalar data 210 template <> 211 struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> { 212 typedef FragmentInt32x1x1 InputType; 213 typedef FragmentInt32x1x1 OutputType; 214 typedef OutputStageClamp OutputStage; 215 216 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 217 218 OutputType Eval(InputType input, int, int) const { 219 const std::int32_t min = output_stage.min; 220 const std::int32_t max = output_stage.max; 221 return std::min(std::max(input.data, min), max); 222 } 223 224 const OutputStage& output_stage; 225 }; 226 227 // Implementation of OutputStageTanh for either scalar or SIMD data 228 template <typename tInputType> 229 struct OutputStageTanhEvalImpl { 230 typedef tInputType InputType; 231 typedef InputType OutputType; 232 typedef typename InputType::DataType DataType; 233 typedef OutputStageTanh OutputStage; 234 235 OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) { 236 const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; 237 const std::int32_t real_amplitude_as_int32 = 238 output_stage.real_amplitude_as_int32; 239 240 input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32; 241 input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32; 242 output_min = real_zero_as_int32 - real_amplitude_as_int32; 243 output_max = real_zero_as_int32 + real_amplitude_as_int32; 244 245 double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32; 246 inverse_amplitude_neg_exponent = 0; 247 while (inverse_amplitude_normalized_double < 0.5) { 248 inverse_amplitude_normalized_double *= 2; 249 inverse_amplitude_neg_exponent++; 250 } 251 inverse_amplitude_normalized = 252 ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double); 253 254 double amplitude_normalized_double = real_amplitude_as_int32; 255 amplitude_exponent = 0; 256 while (amplitude_normalized_double >= 1.0) { 257 amplitude_normalized_double *= 0.5; 258 amplitude_exponent++; 259 } 260 amplitude_normalized = 261 ToFixedPoint<DataType, 0>(amplitude_normalized_double); 262 } 263 264 OutputType Eval(InputType input, int, int) const { 265 const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; 266 267 typedef FixedPoint<DataType, 3> F3; 268 typedef FixedPoint<DataType, 0> F0; 269 270 // fixed-point affine transformation 271 DataType input_centered = 272 Sub(input.data, Dup<DataType>(real_zero_as_int32)); 273 F3 fixedpoint_input = 274 F3::FromRaw(input_centered) * inverse_amplitude_normalized; 275 // left shift 276 fixedpoint_input.raw() = 277 ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent); 278 // fixed-point tanh and multiplication 279 F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized; 280 // right shift 281 DataType int32_output = 282 Add(Dup<DataType>(real_zero_as_int32), 283 ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent)); 284 285 DataType mask_if_below_cutoff_min = 286 MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min)); 287 DataType mask_if_above_cutoff_max = 288 MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max)); 289 290 return SelectUsingMask( 291 mask_if_below_cutoff_min, Dup<DataType>(output_min), 292 SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max), 293 int32_output)); 294 } 295 296 const OutputStage& output_stage; 297 std::int32_t input_cutoff_min, input_cutoff_max; 298 std::int32_t output_min, output_max; 299 FixedPoint<DataType, 0> inverse_amplitude_normalized; 300 int inverse_amplitude_neg_exponent; 301 FixedPoint<DataType, 0> amplitude_normalized; 302 int amplitude_exponent; 303 }; 304 305 template <> 306 struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1> 307 : OutputStageTanhEvalImpl<FragmentInt32x1x1> { 308 OutputStageEvalImpl(const OutputStageTanh& output_stage) 309 : OutputStageTanhEvalImpl(output_stage) {} 310 }; 311 312 // OutputPipelineOutputType is a helper to determine the output data type of a 313 // pipeline, for a 314 // given input data type. It is a recursive template; see the explanation on 315 // OutputPipelineEvalImpl below. 316 template <typename OutputPipelineType, int FirstStage, typename InputType, 317 bool StopRecursion = 318 FirstStage == std::tuple_size<OutputPipelineType>::value> 319 struct OutputPipelineOutputType { 320 typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type 321 FirstStageType; 322 typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType 323 FirstStageOutputType; 324 typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1, 325 FirstStageOutputType>::Type Type; 326 }; 327 328 template <typename OutputPipelineType, int FirstStage, typename InputType> 329 struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, 330 true> { 331 typedef InputType Type; 332 }; 333 334 // OutputPipelineEvalImpl is a helper to implement the evaluation of 335 // the whole pipeline. It is a recursive template to implement compile-time 336 // unrolling of the loop over all pipeline stages. The 'FirstStage' parameter 337 // is how we implement recursion: each specialization implements only 338 // evaluation starting at 'FirstStage'. The StopRecursion parameter is just a 339 // helper to implement the termination of the recursion as a partial 340 // specialization below. 341 template <typename OutputPipelineType, int FirstStage, typename InputType, 342 bool StopRecursion = 343 FirstStage == std::tuple_size<OutputPipelineType>::value> 344 struct OutputPipelineEvalImpl { 345 typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type 346 FirstStageType; 347 typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType 348 FirstStageOutputType; 349 typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage, 350 InputType>::Type OutputType; 351 352 OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline) 353 : head_impl(std::get<FirstStage>(output_pipeline)), 354 tail_impl(output_pipeline) {} 355 356 OutputType Eval(InputType input, int row, int col) const { 357 // Evaluate the first stage. 358 FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col); 359 // Recurse into the remaining stages. 360 return tail_impl.Eval(first_stage_output, row, col); 361 } 362 363 const OutputStageEvalImpl<FirstStageType, InputType> head_impl; 364 const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1, 365 FirstStageOutputType> 366 tail_impl; 367 }; 368 369 // Specialization on 'StopRecursion' for terminating the recursion. 370 template <typename OutputPipelineType, int FirstStage, typename InputType> 371 struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> { 372 OutputPipelineEvalImpl(const OutputPipelineType&) {} 373 374 InputType Eval(InputType input, int, int) const { 375 // Terminating the recursion. 376 return input; 377 } 378 }; 379 380 // StoreFinalOutput takes the final value at the end of the output pipeline and 381 // stores it into the destination matrix. It can be specialized for different 382 // data types; the generic implementation here is typically used only for plain 383 // old scalar (not SIMD) types. 384 template <typename OutputType, typename DstType> 385 void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) { 386 *dst->data(row, col) = value; 387 } 388 389 template <typename OutputPipelineType, typename InputType> 390 struct OutputPipelineExecutor { 391 OutputPipelineExecutor(const OutputPipelineType& output_pipeline) 392 : output_pipeline_eval_impl_(output_pipeline) {} 393 394 // RunOutputPipeline is the entry point into the output pipeline evaluation 395 // code. It should be the only thing that unpack code calls. It takes the 396 // result 397 // of the unpack stage and stores it into the destination matrix. 398 template <typename DstType> 399 void Execute(InputType input, DstType* dst, int row, int col) { 400 // Statically assert that the output pipeline matches the given destination 401 // matrix's scalar type. 402 typedef typename OutputPipelineOutputType<OutputPipelineType, 0, 403 FragmentInt32x1x1>::Type::DataType 404 ScalarOutputType; 405 typedef typename DstType::Scalar ScalarDstType; 406 static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value, 407 "mismatched destination scalar type and output pipeline"); 408 409 // Evaluate the output pipeline. 410 auto output = output_pipeline_eval_impl_.Eval(input, row, col); 411 // Store the result into the destination matrix. 412 StoreFinalOutput(output, dst, row, col); 413 } 414 415 const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType> 416 output_pipeline_eval_impl_; 417 }; 418 419 } // namespace gemmlowp 420 421 #endif // GEMMLOWP_INTERNAL_OUTPUT_H_ 422