// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_neon.h: optimized NEON specializations of the templates in output.h.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_NEON_H_
#define GEMMLOWP_INTERNAL_OUTPUT_NEON_H_

#include "output.h"

#include <arm_neon.h>

namespace gemmlowp {

// Definitions of Fragment types wrapping NEON vector types.
typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor> NEONFragmentInt32x4x1;
typedef Fragment<int32x4x4_t, 16, 1, MapOrder::ColMajor> NEONFragmentInt32x16x1;
typedef Fragment<uint8x8_t, 4, 1, MapOrder::ColMajor> NEONFragmentUint8x4x1;
typedef Fragment<uint8x16_t, 16, 1, MapOrder::ColMajor> NEONFragmentUint8x16x1;

// The code in unpack_neon.h will whenever possible process
// 16 entries at once (4 SIMD vectors of 4 entries each at once),
// to offer the compiler better optimization opportunities, reducing
// register dependencies. From the perspective of interfacing with the output
// pipeline, this takes the form of passing Fragment types wrapping int32x4x4_t
// data. In most cases, such data is handled simply by handling separately its
// 4 int32x4_t components. This partial specialization handles that for
// arbitrary output stages implementing an int32x4_t path. Only some output
// stages below will override this to use custom code to handle int32x4x4_t
// data all at once (see OutputStageSaturatingCastToUint8 below).
template <typename OutputStageType>
struct OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentInt32x16x1 OutputType;
  typedef OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x4x1>
      ImplInt32x4;
  OutputStageEvalImpl(const OutputStageType& s) : impl_int32x4(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    OutputType output;

    for (int i = 0; i < 4; i++) {
      output.data.val[i] =
          impl_int32x4.Eval(input.data.val[i], row + 4 * i, col);
    }
    return output;
  }

  ImplInt32x4 impl_int32x4;
};
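
// For example, a 16-entry fragment whose first entry is at (row, col) is
// evaluated as four 4-entry sub-fragments whose first entries are at
// (row, col), (row + 4, col), (row + 8, col) and (row + 12, col), all within
// the same column.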

// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for
// NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t a = vaddq_s32(input, vdupq_n_s32(result_offset));
    const int32x4_t b =
        vmlaq_n_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};
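
// In scalar terms, the above computes, for each of the 4 lanes:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where rounding = (1 << (result_shift - 1)) when result_shift >= 1, else 0,
// i.e. a rounding (rather than truncating) right shift. With illustrative
// values input = 1000, result_offset = 50, result_mult_int = 77,
// result_shift = 10, this yields (1050 * 77 + 512) >> 10 = 79.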

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// NEONFragmentInt32x4x1, for the Col (per-row) case.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t result_mult_int =
        vld1q_s32(output_stage.result_mult_int.data(row));
    const int32x4_t result_offset =
        vld1q_s32(output_stage.result_offset.data(row));
    const int32x4_t a = vaddq_s32(input, result_offset);
    const int32x4_t b =
        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// NEONFragmentInt32x4x1, for the Row (per-column) case.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    // The fragment covers 4 rows of a single column, so Row-shaped
    // (per-column) vectors contribute a single scalar, indexed by col and
    // broadcast across the 4 lanes.
    const std::int32_t result_mult_int = output_stage.result_mult_int(col);
    const std::int32_t result_offset = output_stage.result_offset(col);
    const int32x4_t a = vaddq_s32(input, vdupq_n_s32(result_offset));
    const int32x4_t b =
        vmlaq_n_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};
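
// Note on the two per-channel cases above: the fragment is a 4x1 column-major
// block (4 consecutive rows within one column), so a Col-shaped (per-row)
// vector contributes 4 distinct values, loaded contiguously at 'row' with
// vld1q_s32, while a Row-shaped (per-column) vector contributes a single
// value at 'col', broadcast across the 4 lanes.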

// Implementation of OutputStageSaturatingCastToUint8 for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentUint8x4x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    int16x8_t q16 = vcombine_s16(vqmovn_s32(input), vdup_n_s16(0));
    return vqmovun_s16(q16);
  }
};
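
// The narrowing chain above is: int32x4 -> int16x4 (signed saturating narrow),
// zero-padded into an int16x8, then int16x8 -> uint8x8 (unsigned saturating
// narrow). Only the low 4 lanes of the resulting uint8x8_t carry meaningful
// data, which is the 50% register utilization issue discussed below.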

// In the case of OutputStageSaturatingCastToUint8, the handling of
// NEONFragmentInt32x16x1 data can be made much more efficient by handling
// it all at once, instead of as 4 separate int32x4 values as in the above
// generic partial specialization. This also avoids the poor (50%) register
// utilization of FragmentUint8x4x1: by handling 16 scalar values at once,
// we are able to fill a uint8x16_t.
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentUint8x16x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    int16x8_t q16[2];
    for (int i = 0; i < 2; i++) {
      q16[i] = vcombine_s16(vqmovn_s32(input.data.val[2 * i]),
                            vqmovn_s32(input.data.val[2 * i + 1]));
    }
    return vcombine_u8(vqmovun_s16(q16[0]), vqmovun_s16(q16[1]));
  }
};
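
// Here all 16 lanes are meaningful: the four int32x4_t components are first
// narrowed into two int16x8_t intermediates, which are then narrowed and
// combined into a full uint8x16_t. As with any saturating cast to uint8,
// inputs below 0 map to 0 and inputs above 255 map to 255.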

// Implementation of OutputStageBiasAddition for NEONFragmentInt32x4x1
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    int32x4_t bias;
    if (VectorType::kShape == VectorShape::Row) {
      bias = vdupq_n_s32(output_stage.bias_vector(col));
    } else {
      bias = vld1q_s32(output_stage.bias_vector.data(row));
    }
    return vaddq_s32(input, bias);
  }

  const OutputStage& output_stage;
};
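
// For example, with a Col-shaped (per-row) bias vector, the 4 lanes receive
// the distinct entries bias_vector(row) ... bias_vector(row + 3); with a
// Row-shaped (per-column) bias vector, all 4 lanes share bias_vector(col).
// The branch on VectorType::kShape compares a compile-time constant, so
// compilers can typically fold it away.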

// Implementation of OutputStageClamp for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageClamp, NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const int32x4_t min = vdupq_n_s32(output_stage.min);
    const int32x4_t max = vdupq_n_s32(output_stage.max);
    return vminq_s32(vmaxq_s32(input, min), max);
  }

  const OutputStage& output_stage;
};
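
// Each lane is clamped independently to [output_stage.min, output_stage.max].
// Illustrative usage: clamping to an activation range such as [0, 255], or to
// any narrower [min, max] interval, before a final cast to uint8.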

// Implementation of OutputStageTanh for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageTanh, NEONFragmentInt32x4x1>
    : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// Specialization of StoreFinalOutput for NEONFragmentUint8x4x1.
// This is quite inefficient, but we have no choice: instructions storing 32bit
// at once also assume 32bit alignment. In practice, this slowness is not a
// problem because we use the x16 path for most values.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x4x1 value, DstType* dst, int row,
                             int col) {
  vst1_lane_u8(dst->data(row + 0, col), value, 0);
  vst1_lane_u8(dst->data(row + 1, col), value, 1);
  vst1_lane_u8(dst->data(row + 2, col), value, 2);
  vst1_lane_u8(dst->data(row + 3, col), value, 3);
}
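
// Each vst1_lane_u8 above stores a single byte, so the destination needs no
// particular alignment, at the cost of 4 separate store instructions.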

// Specialization of StoreFinalOutput for NEONFragmentUint8x16x1.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x16x1 value, DstType* dst,
                             int row, int col) {
  vst1q_u8(dst->data(row, col), value);
}

// Specialization of StoreFinalOutput for NEONFragmentInt32x4x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x4x1 value, DstType* dst, int row,
                             int col) {
  vst1q_s32(dst->data(row, col), value);
}

// Specialization of StoreFinalOutput for NEONFragmentInt32x16x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x16x1 value, DstType* dst,
                             int row, int col) {
  for (int i = 0; i < 4; i++) {
    vst1q_s32(dst->data(row + 4 * i, col), value.data.val[i]);
  }
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_NEON_H_