1 /*
2  * Copyright 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
18 #define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
19 
20 #include <array>  // std::size
21 #include <type_traits>
22 
/*
  The intrinsics utility library contains helper functions for wide width DSP support.
  We use templated types to allow testing from scalar to vector values.

  See the Eigen project for general abstracted linear algebra acceleration.
  http://eigen.tuxfamily.org/
*/
30 
31 // We conditionally include neon optimizations for ARM devices
32 #pragma push_macro("USE_NEON")
33 #undef USE_NEON
34 
35 #if defined(__ARM_NEON__) || defined(__aarch64__)
36 #include <arm_neon.h>
37 #define USE_NEON
38 #endif
39 
40 namespace android::audio_utils::intrinsics {
41 
// A `false` constant that depends on a template parameter.
//
// Writing static_assert(false) directly inside a template can fail as soon as
// the template is parsed; making the condition depend on T postpones the
// failure until the enclosing branch is actually instantiated.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <typename T>
inline constexpr bool dependent_false_v = false;
46 
// Struct wrapping a fixed-size array, usable as a composite type in the Neon
// template functions in this header.
//
// The struct has exactly one member, `v`, and it is that member (not the
// struct itself) which satisfies std::is_array_v<>.
//
// T: element type of the embedded array.
// N: number of elements in the embedded array.
template<typename T, size_t N>
struct internal_array_t {
    T v[N];
};
53 
54 /*
55   Generalized template functions for the Neon instruction set.
56 
57   See here for some general comments from ARM.
58   https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization
59 
60   Notes:
61   1) We provide scalar equivalents which are compilable even on non-ARM processors.
62   2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
63   3) NEON double SIMD acceleration is only available on 64 bit architectures.
64      On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.
65 
66   We create a generic Neon acceleration to be applied to a composite type.
67 
68   The type follows the following compositional rules for simplicity:
69       1) must be a primitive floating point type.
70       2) must be a NEON data type.
71       3) must be a struct with one member, either
72            a) an array of types 1-3.
73            b) a cons-pair struct of 2 possibly different members of types 1-3.
74 
75   Examples of possible struct definitions:
76   using alternative_2_t = struct { struct { float a; float b; } s; };
77   using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
78   using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
79 */
80 
// Duplicates the scalar f into every element of a value of type T.
//
// T: float, double, a supported NEON vector type (only when USE_NEON is
//    defined), or a single-member struct whose member is either an array of
//    such types or a two-member struct of such types.
// F: scalar type of the value being broadcast.
//
// Returns a T whose every element was produced from f. Composite types are
// filled by recursing into each array element or each half of the pair.
template<typename T, typename F>
static inline T vdupn(F f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Anything reaching here is decomposed with structured bindings, which
        // requires a class type; fail with a readable message otherwise.
        static_assert(std::is_class_v<T>,
                "vdupn: T must be float, double, a NEON vector, or a single-member struct");
        T ret;
        auto &[retval] = ret;  // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Member is an array: broadcast into each element.
#pragma unroll
            for (auto& val : retval) {
                val = vdupn<std::decay_t<decltype(val)>>(f);
            }
            return ret;
        } else /* constexpr */ {
            // Member is a pair: broadcast into both halves, which may differ in type.
            auto &[r1, r2] = retval;
            using r1_type = std::decay_t<decltype(r1)>;
            using r2_type = std::decay_t<decltype(r2)>;
            r1 = vdupn<r1_type>(f);
            r2 = vdupn<r2_type>(f);
            return ret;
        }
    }
}
117 
// Loads a value of type T from consecutive scalars starting at f.
//
// T: float, double, a supported NEON vector type (only when USE_NEON is
//    defined), or a single-member struct whose member is either an array of
//    such types or a two-member struct of such types.
// F: scalar type pointed to by f.
//
// Returns the loaded T. For composite types the pointer is advanced between
// sub-loads by sizeof(sub-element) / sizeof(F).
// NOTE(review): that advance presumes each sub-element is a densely packed
// run of F values; a composite element containing padding would over-advance.
// Confirm against the types callers actually use.
template<typename T, typename F>
static inline T vld1(const F *f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Anything reaching here is decomposed with structured bindings, which
        // requires a class type; fail with a readable message otherwise.
        static_assert(std::is_class_v<T>,
                "vld1: T must be float, double, a NEON vector, or a single-member struct");
        T ret;
        auto &[retval] = ret;  // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
            using element_type = std::decay_t<decltype(retval[0])>;
            // Number of F scalars consumed by one array element.
            constexpr size_t subelements = sizeof(element_type) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(retval); ++i) {
                retval[i] = vld1<element_type>(f);
                f += subelements;
            }
            return ret;
        } else /* constexpr */ {
            // Member is a pair: load the first half, skip past it, load the second.
            auto &[r1, r2] = retval;
            using r1_type = std::decay_t<decltype(r1)>;
            using r2_type = std::decay_t<decltype(r2)>;
            r1 = vld1<r1_type>(f);
            f += sizeof(r1) / sizeof(F);
            r2 = vld1<r2_type>(f);
            return ret;
        }
    }
}
158 
// Multiply-accumulate: returns a + b * c, element by element.
//
// T: float, double, a supported NEON vector type (only when USE_NEON is
//    defined), or a single-member struct whose member is either an array of
//    such types or a two-member struct of such types.
//
// The scalar branch evaluates `a + b * c` as written and the NEON branches
// call the vmla/vmlaq intrinsics; composite types recurse per element.
// NOTE(review): nothing here requests a fused (single-rounding) operation;
// whether the multiply and add are fused depends on the compiler and target.
// Do not rely on FMA rounding without confirming.
template<typename T>
static inline T vmla(T a, T b, T c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a + b * c;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmla_f32(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmlaq_f32(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmlaq_f64(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Anything reaching here is decomposed with structured bindings, which
        // requires a class type; fail with a readable message otherwise.
        static_assert(std::is_class_v<T>,
                "vmla: T must be float, double, a NEON vector, or a single-member struct");
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        const auto &[cval] = c;
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Member is an array: combine matching elements.
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vmla(aval[i], bval[i], cval[i]);
            }
            return ret;
        } else /* constexpr */ {
            // Member is a pair: combine the first halves and the second halves.
            auto &[r1, r2] = retval;
            const auto &[a1, a2] = aval;
            const auto &[b1, b2] = bval;
            const auto &[c1, c2] = cval;
            r1 = vmla(a1, b1, c1);
            r2 = vmla(a2, b2, c2);
            return ret;
        }
    }
}
199 
// Multiply: returns a * b, element by element.
//
// T: float, double, a supported NEON vector type (only when USE_NEON is
//    defined), or a single-member struct whose member is either an array of
//    such types or a two-member struct of such types.
//
// Composite types are handled by recursing into each array element or each
// half of the pair.
template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Anything reaching here is decomposed with structured bindings, which
        // requires a class type; fail with a readable message otherwise.
        static_assert(std::is_class_v<T>,
                "vmul: T must be float, double, a NEON vector, or a single-member struct");
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Member is an array: multiply matching elements.
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vmul(aval[i], bval[i]);
            }
            return ret;
        } else /* constexpr */ {
            // Member is a pair: multiply the first halves and the second halves.
            auto &[r1, r2] = retval;
            const auto &[a1, a2] = aval;
            const auto &[b1, b2] = bval;
            r1 = vmul(a1, b1);
            r2 = vmul(a2, b2);
            return ret;
        }
    }
}
238 
// Negate: returns -f, element by element.
//
// T: float, double, a supported NEON vector type (only when USE_NEON is
//    defined), or a single-member struct whose member is either an array of
//    such types or a two-member struct of such types.
//
// Composite types are handled by recursing into each array element or each
// half of the pair.
template<typename T>
static inline T vneg(T f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return -f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vneg_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vnegq_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vnegq_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Anything reaching here is decomposed with structured bindings, which
        // requires a class type; fail with a readable message otherwise.
        static_assert(std::is_class_v<T>,
                "vneg: T must be float, double, a NEON vector, or a single-member struct");
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[fval] = f;
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Member is an array: negate each element.
#pragma unroll
            for (size_t i = 0; i < std::size(fval); ++i) {
                retval[i] = vneg(fval[i]);
            }
            return ret;
        } else /* constexpr */ {
            // Member is a pair: negate both halves.
            auto &[r1, r2] = retval;
            const auto &[f1, f2] = fval;
            r1 = vneg(f1);
            r2 = vneg(f2);
            return ret;
        }
    }
}
275 
// Stores the value a into consecutive scalars starting at f.
//
// T: float, double, a supported NEON vector type (only when USE_NEON is
//    defined), or a single-member struct whose member is either an array of
//    such types or a two-member struct of such types.
// F: scalar type pointed to by f.
//
// For composite types the pointer is advanced between sub-stores by
// sizeof(sub-element) / sizeof(F).
// NOTE(review): that advance presumes each sub-element is a densely packed
// run of F values; a composite element containing padding would over-advance.
// Confirm against the types callers actually use.
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Anything reaching here is decomposed with structured bindings, which
        // requires a class type; fail with a readable message otherwise.
        static_assert(std::is_class_v<T>,
                "vst1: T must be float, double, a NEON vector, or a single-member struct");
        const auto &[aval] = a;
        if constexpr (std::is_array_v<decltype(aval)>) {
            // Number of F scalars written by one array element.
            constexpr size_t subelements = sizeof(std::decay_t<decltype(aval[0])>) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                vst1(f, aval[i]);
                f += subelements;
            }
        } else /* constexpr */ {
            // Member is a pair: store the first half, skip past it, store the second.
            const auto &[a1, a2] = aval;
            vst1(f, a1);
            f += sizeof(std::decay_t<decltype(a1)>) / sizeof(F);
            vst1(f, a2);
        }
    }
}
310 
311 } // namespace android::audio_utils::intrinsics
312 
313 #pragma pop_macro("USE_NEON")
314 
315 #endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
316