1 /*
2 * Copyright 2020 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
18 #define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
19
20 #include <array> // std::size
21 #include <type_traits>
22
23 /*
The intrinsics utility library contains helper functions for wide width DSP support.
25 We use templated types to allow testing from scalar to vector values.
26
27 See the Eigen project for general abstracted linear algebra acceleration.
28 http://eigen.tuxfamily.org/
29 */
30
31 // We conditionally include neon optimizations for ARM devices
32 #pragma push_macro("USE_NEON")
33 #undef USE_NEON
34
35 #if defined(__ARM_NEON__) || defined(__aarch64__)
36 #include <arm_neon.h>
37 #define USE_NEON
38 #endif
39
40 namespace android::audio_utils::intrinsics {
41
// Variable template whose value is false for every type argument.
// Because the value formally depends on a template parameter, it can be used
// in place of a literal `false` in a static_assert inside a template, which
// avoids the assertion failing early.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <typename>
inline constexpr bool dependent_false_v{false};
46
// Aggregate that wraps a fixed-size built-in array so that it can be used with
// the Neon template functions below.
// Its only member, `v`, is a C array of Count elements, so
// std::is_array_v<decltype(v)> holds for it.
template<typename Element, size_t Count>
struct internal_array_t {
    Element v[Count];
};
53
54 /*
55 Generalized template functions for the Neon instruction set.
56
57 See here for some general comments from ARM.
58 https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization
59
60 Notes:
61 1) We provide scalar equivalents which are compilable even on non-ARM processors.
62 2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
63 3) NEON double SIMD acceleration is only available on 64 bit architectures.
64 On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.
65
66 We create a generic Neon acceleration to be applied to a composite type.
67
For simplicity, the type must match exactly one of the following compositional rules:
69 1) must be a primitive floating point type.
70 2) must be a NEON data type.
71 3) must be a struct with one member, either
72 a) an array of types 1-3.
73 b) a cons-pair struct of 2 possibly different members of types 1-3.
74
75 Examples of possible struct definitions:
76 using alternative_2_t = struct { struct { float a; float b; } s; };
77 using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
78 using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
79 */
80
// Broadcast: builds a T in which every scalar lane holds the value f.
//
// T is resolved at compile time:
//  - float or double: f itself is returned (implicitly converted to T).
//  - a NEON vector type (only when USE_NEON is defined; float64x2_t only on
//    aarch64): the matching vdup intrinsic is used.
//  - otherwise T is treated as a struct with exactly one member, which is either
//    an array (every element is filled recursively) or a struct with two
//    members (each member is filled recursively).
template<typename T, typename F>
static inline T vdupn(F f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T result;
        auto &[member] = result;  // T wraps exactly one member
        if constexpr (!std::is_array_v<decltype(member)>) {
            // cons-pair: fill both halves independently.
            auto &[first, second] = member;
            first = vdupn<std::decay_t<decltype(first)>>(f);
            second = vdupn<std::decay_t<decltype(second)>>(f);
        } else /* constexpr */ {
            // array: fill each element with the same value.
            using element_type = std::decay_t<decltype(member[0])>;
#pragma unroll
            for (size_t i = 0; i < std::size(member); ++i) {
                member[i] = vdupn<element_type>(f);
            }
        }
        return result;
    }
}
117
// Load: reads consecutive values of type F starting at f and packs them into a T.
//
// T is resolved at compile time:
//  - float or double: *f is returned.
//  - a NEON vector type (only when USE_NEON is defined; float64x2_t only on
//    aarch64): the matching vld1 intrinsic is used.
//  - otherwise T is treated as a struct with exactly one member, which is either
//    an array (elements are loaded one after another) or a struct with two
//    members (the first is loaded from f, the second from just past it).
//
// The distance between consecutive sub-loads is sizeof(sub-type) / sizeof(F),
// i.e. it is derived from the size of the sub-type, padding included.
template<typename T, typename F>
static inline T vld1(const F *f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T result;
        auto &[member] = result;  // T wraps exactly one member
        if constexpr (!std::is_array_v<decltype(member)>) {
            // cons-pair: second half starts right after the first half's data.
            auto &[first, second] = member;
            using first_type = std::decay_t<decltype(first)>;
            using second_type = std::decay_t<decltype(second)>;
            first = vld1<first_type>(f);
            second = vld1<second_type>(f + sizeof(first) / sizeof(F));
        } else /* constexpr */ {
            using element_type = std::decay_t<decltype(member[0])>;
            // number of F values occupied by one array element.
            constexpr size_t stride = sizeof(element_type) / sizeof(F);
            const F *src = f;
#pragma unroll
            for (auto &element : member) {
                element = vld1<element_type>(src);
                src += stride;
            }
        }
        return result;
    }
}
158
// Multiply-accumulate: returns a + b * c, computed lane by lane.
//
// T is resolved at compile time:
//  - float or double: the expression a + b * c is returned.
//  - a NEON vector type (only when USE_NEON is defined; float64x2_t only on
//    aarch64): the matching vmla intrinsic is used.
//  - otherwise T is treated as a struct with exactly one member, which is either
//    an array (processed element by element) or a struct with two members
//    (each member is processed recursively).
template<typename T>
static inline T vmla(T a, T b, T c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a + b * c;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmla_f32(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmlaq_f32(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmlaq_f64(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T result;
        auto &[out] = result;  // T wraps exactly one member
        const auto &[acc] = a;
        const auto &[lhs] = b;
        const auto &[rhs] = c;
        if constexpr (!std::is_array_v<decltype(out)>) {
            // cons-pair: combine the matching halves.
            auto &[out1, out2] = out;
            const auto &[acc1, acc2] = acc;
            const auto &[lhs1, lhs2] = lhs;
            const auto &[rhs1, rhs2] = rhs;
            out1 = vmla(acc1, lhs1, rhs1);
            out2 = vmla(acc2, lhs2, rhs2);
        } else /* constexpr */ {
            // array: all operands share the same element count.
            constexpr size_t count = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t i = 0; i != count; ++i) {
                out[i] = vmla(acc[i], lhs[i], rhs[i]);
            }
        }
        return result;
    }
}
199
// Multiply: returns a * b, computed lane by lane.
//
// T is resolved at compile time:
//  - float or double: the product a * b is returned.
//  - a NEON vector type (only when USE_NEON is defined; float64x2_t only on
//    aarch64): the matching vmul intrinsic is used.
//  - otherwise T is treated as a struct with exactly one member, which is either
//    an array (processed element by element) or a struct with two members
//    (each member is processed recursively).
template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T result;
        auto &[out] = result;  // T wraps exactly one member
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (!std::is_array_v<decltype(out)>) {
            // cons-pair: multiply the matching halves.
            auto &[out1, out2] = out;
            const auto &[lhs1, lhs2] = lhs;
            const auto &[rhs1, rhs2] = rhs;
            out1 = vmul(lhs1, rhs1);
            out2 = vmul(lhs2, rhs2);
        } else /* constexpr */ {
            // array: both operands share the same element count.
            constexpr size_t count = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t i = 0; i != count; ++i) {
                out[i] = vmul(lhs[i], rhs[i]);
            }
        }
        return result;
    }
}
238
// Negate: returns -f, computed lane by lane.
//
// T is resolved at compile time:
//  - float or double: -f is returned.
//  - a NEON vector type (only when USE_NEON is defined; float64x2_t only on
//    aarch64): the matching vneg intrinsic is used.
//  - otherwise T is treated as a struct with exactly one member, which is either
//    an array (processed element by element) or a struct with two members
//    (each member is processed recursively).
template<typename T>
static inline T vneg(T f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return -f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vneg_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vnegq_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vnegq_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T result;
        auto &[out] = result;  // T wraps exactly one member
        const auto &[in] = f;
        if constexpr (!std::is_array_v<decltype(out)>) {
            // cons-pair: negate both halves.
            auto &[out1, out2] = out;
            const auto &[in1, in2] = in;
            out1 = vneg(in1);
            out2 = vneg(in2);
        } else /* constexpr */ {
            // array: input and output share the same element count.
            constexpr size_t count = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t i = 0; i != count; ++i) {
                out[i] = vneg(in[i]);
            }
        }
        return result;
    }
}
275
// Store: writes the lanes of a to consecutive values of type F starting at f.
//
// T is resolved at compile time:
//  - float or double: a is assigned to *f.
//  - a NEON vector type (only when USE_NEON is defined; float64x2_t only on
//    aarch64): the matching vst1 intrinsic is used.
//  - otherwise T is treated as a struct with exactly one member, which is either
//    an array (elements are stored one after another) or a struct with two
//    members (the first is stored at f, the second just past it).
//
// The distance between consecutive sub-stores is sizeof(sub-type) / sizeof(F),
// i.e. it is derived from the size of the sub-type, padding included.
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        const auto &[member] = a;  // T wraps exactly one member
        if constexpr (!std::is_array_v<decltype(member)>) {
            // cons-pair: second half goes right after the first half's data.
            const auto &[first, second] = member;
            vst1(f, first);
            vst1(f + sizeof(std::decay_t<decltype(first)>) / sizeof(F), second);
        } else /* constexpr */ {
            // number of F values occupied by one array element.
            constexpr size_t stride = sizeof(std::decay_t<decltype(member[0])>) / sizeof(F);
            F *dst = f;
#pragma unroll
            for (const auto &element : member) {
                vst1(dst, element);
                dst += stride;
            }
        }
    }
}
310
311 } // namespace android::audio_utils::intrinsics
312
313 #pragma pop_macro("USE_NEON")
314
315 #endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
316