// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_neon.h: optimized NEON specializations of the templates in output.h.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_NEON_H_
#define GEMMLOWP_INTERNAL_OUTPUT_NEON_H_

#include "output.h"

#include <arm_neon.h>

namespace gemmlowp {

// Definitions of Fragment types wrapping NEON vector types.
typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor> NEONFragmentInt32x4x1;
typedef Fragment<int32x4x4_t, 16, 1, MapOrder::ColMajor>
    NEONFragmentInt32x16x1;
typedef Fragment<uint8x8_t, 4, 1, MapOrder::ColMajor> NEONFragmentUint8x4x1;
typedef Fragment<uint8x16_t, 16, 1, MapOrder::ColMajor> NEONFragmentUint8x16x1;

// The code in unpack_neon.h will, whenever possible, process 16 entries at
// once (4 SIMD vectors of 4 entries each), to offer the compiler better
// optimization opportunities and to reduce register dependencies. From the
// perspective of interfacing with the output pipeline, this takes the form
// of passing Fragment types wrapping int32x4x4_t data. In most cases, such
// data is handled simply by handling its 4 int32x4_t components separately.
// This partial specialization handles that for arbitrary output stages
// implementing an int32x4_t path. Only some output stages below override
// this with custom code handling int32x4x4_t data all at once (see
// OutputStageSaturatingCastToUint8 below).
template <typename OutputStageType>
struct OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentInt32x16x1 OutputType;
  typedef OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x4x1>
      ImplInt32x4;
  OutputStageEvalImpl(const OutputStageType& s) : impl_int32x4(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    OutputType output;
    // Delegate to the int32x4_t implementation, once per 4-entry component.
    for (int i = 0; i < 4; i++) {
      output.data.val[i] =
          impl_int32x4.Eval(input.data.val[i], row + 4 * i, col);
    }
    return output;
  }

  ImplInt32x4 impl_int32x4;
};

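// A note on the arithmetic in the quantize-down stages below. In scalar
// terms, each of them computes the following (a sketch for exposition only;
// this helper is not part of this header):
//
//   std::int32_t QuantizeDown(std::int32_t x, std::int32_t offset,
//                             std::int32_t mult, std::int32_t shift) {
//     // Adding 2^(shift - 1) before the arithmetic right shift implements
//     // rounding to nearest; a bare shift would round toward -infinity.
//     const std::int32_t rounding = (shift < 1) ? 0 : (1 << (shift - 1));
//     return ((x + offset) * mult + rounding) >> shift;
//   }
//
// The NEON code below fuses the multiplication and the rounding-offset
// addition into a single vmlaq, and performs the right shift as a vshlq by
// the negated shift amount.
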
// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for
// NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t a = vaddq_s32(input, vdupq_n_s32(result_offset));
    const int32x4_t b =
        vmlaq_n_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// NEONFragmentInt32x4x1, in the VectorShape::Col case: the per-channel
// multipliers and offsets are indexed by row, so for a 4x1 column fragment
// they are loaded as whole vectors.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t result_mult_int =
        vld1q_s32(output_stage.result_mult_int.data(row));
    const int32x4_t result_offset =
        vld1q_s32(output_stage.result_offset.data(row));
    const int32x4_t a = vaddq_s32(input, result_offset);
    const int32x4_t b =
        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// NEONFragmentInt32x4x1, in the VectorShape::Row case: the per-channel
// multipliers and offsets are indexed by column, and a 4x1 column fragment
// lies entirely within one column, so a single scalar is broadcast.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t result_mult_int =
        vdupq_n_s32(output_stage.result_mult_int(col));
    const int32x4_t result_offset =
        vdupq_n_s32(output_stage.result_offset(col));
    const int32x4_t a = vaddq_s32(input, result_offset);
    const int32x4_t b =
        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageSaturatingCastToUint8 for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentUint8x4x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    // Narrow int32x4 to int16x4 with signed saturation, pad with zeros to
    // int16x8, then narrow to uint8x8 with unsigned saturation. Only the
    // low 4 lanes of the result carry data.
    int16x8_t q16 = vcombine_s16(vqmovn_s32(input), vdup_n_s16(0));
    return vqmovun_s16(q16);
  }
};

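// In scalar terms, the saturating cast implemented here (and, for x16
// fragments, below) amounts to the following sketch:
//
//   std::uint8_t SaturatingCast(std::int32_t x) {
//     return static_cast<std::uint8_t>(std::min(std::max(x, 0), 255));
//   }
//
// The clamping comes for free from the vqmovn/vqmovun narrowing
// instructions, so no explicit min/max is needed.
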
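// For context, these stages are typically assembled by callers into an
// output pipeline, which gemmlowp represents as a std::tuple of output
// stages. A sketch of such caller-side usage (not code from this header):
//
//   OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage;
//   quantize_down_stage.result_offset = result_offset;
//   quantize_down_stage.result_mult_int = result_mult_int;
//   quantize_down_stage.result_shift = result_shift;
//   OutputStageSaturatingCastToUint8 saturating_cast_stage;
//   const auto output_pipeline =
//       std::make_tuple(quantize_down_stage, saturating_cast_stage);
//
// The specializations in this file are what such a pipeline dispatches to
// when the unpack stage runs on NEON.
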
// In the case of OutputStageSaturatingCastToUint8, the handling of
// NEONFragmentInt32x16x1 data can be made much more efficient by handling it
// all at once, instead of as 4 separate int32x4 values as in the generic
// partial specialization above. This also avoids the poor (50%) register
// utilization of NEONFragmentUint8x4x1: by handling 16 scalar values at
// once, we are able to fill a whole uint8x16_t.
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentUint8x16x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    // Narrow the 4 int32x4 components pairwise into 2 int16x8 vectors, then
    // narrow those into a single uint8x16, saturating at each step.
    int16x8_t q16[2];
    for (int i = 0; i < 2; i++) {
      q16[i] = vcombine_s16(vqmovn_s32(input.data.val[2 * i]),
                            vqmovn_s32(input.data.val[2 * i + 1]));
    }
    return vcombine_u8(vqmovun_s16(q16[0]), vqmovun_s16(q16[1]));
  }
};

// Implementation of OutputStageBiasAddition for NEONFragmentInt32x4x1
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    int32x4_t bias;
    if (VectorType::kShape == VectorShape::Row) {
      // Per-column bias: all 4 rows of this fragment share one value.
      bias = vdupq_n_s32(output_stage.bias_vector(col));
    } else {
      // Per-row bias: load 4 consecutive values starting at row.
      bias = vld1q_s32(output_stage.bias_vector.data(row));
    }
    return vaddq_s32(input, bias);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageClamp for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageClamp, NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const int32x4_t min = vdupq_n_s32(output_stage.min);
    const int32x4_t max = vdupq_n_s32(output_stage.max);
    return vminq_s32(vmaxq_s32(input, min), max);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageTanh for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageTanh, NEONFragmentInt32x4x1>
    : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// Specialization of StoreFinalOutput for NEONFragmentUint8x4x1.
// This is quite inefficient, but we have no choice: instructions storing
// 32 bits at once also assume 32-bit alignment. In practice, this slowness
// is not a problem because the x16 path below is used for most values.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x4x1 value, DstType* dst, int row,
                             int col) {
  vst1_lane_u8(dst->data(row + 0, col), value, 0);
  vst1_lane_u8(dst->data(row + 1, col), value, 1);
  vst1_lane_u8(dst->data(row + 2, col), value, 2);
  vst1_lane_u8(dst->data(row + 3, col), value, 3);
}

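// A note on the block stores below: vst1q_u8 and vst1q_s32 write to
// consecutive addresses starting at &dst(row, col), so these specializations
// presume that the destination stores entries (row, col) through
// (row + N - 1, col) contiguously, i.e. that it is column-major, matching
// the ColMajor Fragment typedefs at the top of this file.
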
// Specialization of StoreFinalOutput for NEONFragmentUint8x16x1.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x16x1 value, DstType* dst,
                             int row, int col) {
  vst1q_u8(dst->data(row, col), value);
}

// Specialization of StoreFinalOutput for NEONFragmentInt32x4x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x4x1 value, DstType* dst, int row,
                             int col) {
  vst1q_s32(dst->data(row, col), value);
}

// Specialization of StoreFinalOutput for NEONFragmentInt32x16x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x16x1 value, DstType* dst,
                             int row, int col) {
  for (int i = 0; i < 4; i++) {
    vst1q_s32(dst->data(row + 4 * i, col), value.data.val[i]);
  }
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_NEON_H_