1 /*
2  * Copyright (C) 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  * Unless required by applicable law or agreed to in writing, software
10  *
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
18 #define ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
19 
20 #include "intrinsic_utils.h"
21 
22 #include <array>
23 #include <cmath>
24 #include <functional>
25 #include <utility>
26 #include <vector>
27 
28 #include <assert.h>
29 
30 // We conditionally include neon optimizations for ARM devices
31 #pragma push_macro("USE_NEON")
32 #undef USE_NEON
33 
34 #if defined(__ARM_NEON__) || defined(__aarch64__)
35 #include <arm_neon.h>
36 #define USE_NEON
37 #endif
38 
39 namespace android::audio_utils {
40 
41 static constexpr size_t kBiquadNumCoefs  = 5;
42 static constexpr size_t kBiquadNumDelays = 2;
43 
44 namespace details {
45 // Helper methods for constructing a constexpr array of function pointers.
46 // As function pointers are efficient and have no constructor/destructor
47 // this is preferred over std::function.
48 //
49 // SC stands for SAME_COEF_PER_CHANNEL, a compile time boolean constant.
50 template <template <size_t, bool, typename ...> typename F, bool SC, size_t... Is>
make_functional_array_from_index_sequence(std::index_sequence<Is...>)51 static inline constexpr auto make_functional_array_from_index_sequence(std::index_sequence<Is...>) {
52     using first_t = decltype(&F<0, false>::func);  // type from function
53     using result_t = std::array<first_t, sizeof...(Is)>;   // type of array
54     return result_t{{F<Is, SC>::func...}};      // initialize with functions.
55 }
56 
57 template <template <size_t, bool, typename ...> typename F, size_t M, bool SC>
make_functional_array()58 static inline constexpr auto make_functional_array() {
59     return make_functional_array_from_index_sequence<F, SC>(std::make_index_sequence<M>());
60 }
61 
62 // Returns true if the poles are stable for a Biquad.
63 template <typename D>
isStable(const D & a1,const D & a2)64 static inline constexpr bool isStable(const D& a1, const D& a2) {
65     return fabs(a2) < D(1) && fabs(a1) < D(1) + a2;
66 }
67 
68 // Simplifies Biquad coefficients.
69 // TODO: consider matched pole/zero cancellation.
70 //       consider subnormal elimination for Intel processors.
71 template <typename D, typename T>
reduceCoefficients(const T & coef)72 std::array<D, kBiquadNumCoefs> reduceCoefficients(const T& coef) {
73     std::array<D, kBiquadNumCoefs> lcoef;
74     if (coef.size() == kBiquadNumCoefs + 1) {
75         // General form of Biquad.
76         // Remove matched z^-1 factors in top and bottom (e.g. coefs[0] == coefs[3] == 0).
77         size_t offset = 0;
78         for (; offset < 2 && coef[offset] == 0 && coef[offset + 3] == 0; ++offset);
79         assert(coefs[offset + 3] != 0); // hmm... shouldn't we be causal?
80 
81         // Normalize 6 coefficients to 5 for storage.
82         lcoef[0] = coef[offset] / coef[offset + 3];
83         for (size_t i = 1; i + offset < 3; ++i) {
84             lcoef[i] = coef[i + offset] / coef[offset + 3];
85             lcoef[i + 2] = coef[i + offset + 3] / coef[offset + 3];
86          }
87     } else if (coef.size() == kBiquadNumCoefs) {
88         std::copy(coef.begin(), coef.end(), lcoef.begin());
89     } else {
90         assert(coef.size() == kBiquadNumCoefs + 1 || coef.size() == kBiquadNumCoefs);
91     }
92     return lcoef;
93 }
94 
95 // Sets a container of coefficients to storage.
96 template <typename D, typename T, typename DEST>
setCoefficients(DEST & dest,size_t offset,size_t stride,size_t channelCount,const T & coef)97 static inline void setCoefficients(
98         DEST& dest, size_t offset, size_t stride, size_t channelCount, const T& coef) {
99     auto lcoef = reduceCoefficients<D, T>(coef);
100     // replicate as needed
101     for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
102         for (size_t j = 0; j < channelCount; ++j) {
103             dest[i * stride + offset + j] = lcoef[i];
104         }
105     }
106 }
107 
108 // For biquad_filter_fast, we template based on whether coef[i] is nonzero - this should be
109 // determined in a constexpr fashion for optimization.
110 
111 // Helper which takes a stride to allow column processing of interleaved audio streams.
112 template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
biquad_filter_1fast(D * out,const D * in,size_t frames,size_t stride,size_t channelCount,D * delays,const D * coefs,size_t localStride)113 void biquad_filter_1fast(D *out, const D *in, size_t frames, size_t stride,
114                          size_t channelCount, D *delays, const D *coefs, size_t localStride) {
115 #if defined(__i386__) || defined(__x86_x64__)
116     D delta = std::numeric_limits<float>::min() * (1 << 24);
117 #endif
118     D b0, b1, b2, negativeA1, negativeA2;
119 
120     if constexpr (SAME_COEF_PER_CHANNEL) {
121         b0 = coefs[0];
122         b1 = coefs[1];
123         b2 = coefs[2];
124         negativeA1 = -coefs[3];
125         negativeA2 = -coefs[4];
126     }
127     for (size_t i = 0; i < channelCount; ++i) {
128         if constexpr (!SAME_COEF_PER_CHANNEL) {
129             b0 = coefs[0];
130             b1 = coefs[localStride];
131             b2 = coefs[2 * localStride];
132             negativeA1 = -coefs[3 * localStride];
133             negativeA2 = -coefs[4 * localStride];
134             ++coefs;
135         }
136 
137         D s1n1 = delays[0];
138         D s2n1 = delays[localStride];
139         const D *input = &in[i];
140         D *output = &out[i];
141         for (size_t j = frames; j > 0; --j) {
142             // Adding a delta to avoid subnormal exception handling on the x86/x64 platform;
143             // this is not a problem with the ARM platform. The delta will not affect the
144             // precision of the result.
145 #if defined(__i386__) || defined(__x86_x64__)
146             const D xn = *input + delta;
147 #else
148             const D xn = *input;
149 #endif
150             D yn = (OCCUPANCY >> 0 & 1) * b0 * xn + s1n1;
151             s1n1 = (OCCUPANCY >> 1 & 1) * b1 * xn + (OCCUPANCY >> 3 & 1) * negativeA1 * yn + s2n1;
152             s2n1 = (OCCUPANCY >> 2 & 1) * b2 * xn + (OCCUPANCY >> 4 & 1) * negativeA2 * yn;
153 
154             input += stride;
155 
156             *output = yn;
157             output += stride;
158 
159 #if defined(__i386__) || defined(__x86_x64__)
160             delta = -delta;
161 #endif
162         }
163         delays[0] = s1n1;
164         delays[localStride] = s2n1;
165         ++delays;
166     }
167 }
168 
169 // Helper function to zero channels in the input buffer.
170 // This is used for the degenerate coefficient case which results in all zeroes.
171 template <typename D>
zeroChannels(D * out,size_t frames,size_t stride,size_t channelCount)172 void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) {
173     if (stride == channelCount) {
174         memset(out, 0, sizeof(float) * frames * channelCount);
175     } else {
176         for (size_t i = 0; i < frames; i++) {
177             memset(out, 0, sizeof(float) * channelCount);
178             out += stride;
179         }
180     }
181 }
182 
183 template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
biquad_filter_fast(D * out,const D * in,size_t frames,size_t stride,size_t channelCount,D * delays,const D * coefs,size_t localStride)184 void biquad_filter_fast(D *out, const D *in, size_t frames, size_t stride,
185         size_t channelCount, D *delays, const D *coefs, size_t localStride) {
186     if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
187         zeroChannels(out, frames, stride, channelCount);
188         return;
189     }
190     biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>(
191             out, in, frames, stride, channelCount, delays, coefs, localStride);
192 }
193 
194 #ifdef USE_NEON
195 
196 template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
biquad_filter_neon_impl(F * out,const F * in,size_t frames,size_t stride,size_t channelCount,F * delays,const F * coefs,size_t localStride)197 void biquad_filter_neon_impl(F *out, const F *in, size_t frames, size_t stride,
198         size_t channelCount, F *delays, const F *coefs, size_t localStride) {
199     using namespace android::audio_utils::intrinsics;
200 
201     constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
202     T b0, b1, b2, negativeA1, negativeA2;
203     if constexpr (SAME_COEF_PER_CHANNEL) {
204         b0 = vdupn<T>(coefs[0]);
205         b1 = vdupn<T>(coefs[1]);
206         b2 = vdupn<T>(coefs[2]);
207         negativeA1 = vneg(vdupn<T>(coefs[3]));
208         negativeA2 = vneg(vdupn<T>(coefs[4]));
209     }
210     for (size_t i = 0; i < channelCount; i += elements) {
211         if constexpr (!SAME_COEF_PER_CHANNEL) {
212             b0 = vld1<T>(coefs);
213             b1 = vld1<T>(coefs + localStride);
214             b2 = vld1<T>(coefs + localStride * 2);
215             negativeA1 = vneg(vld1<T>(coefs + localStride * 3));
216             negativeA2 = vneg(vld1<T>(coefs + localStride * 4));
217             coefs += elements;
218         }
219         T s1 = vld1<T>(&delays[0]);
220         T s2 = vld1<T>(&delays[localStride]);
221         const F *input = &in[i];
222         F *output = &out[i];
223         for (size_t j = frames; j > 0; --j) {
224             T xn = vld1<T>(input);
225             T yn = s1;
226 
227             if constexpr (OCCUPANCY >> 0 & 1) {
228                 yn = vmla(yn, b0, xn);
229             }
230             s1 = s2;
231             if constexpr (OCCUPANCY >> 3 & 1) {
232                 s1 = vmla(s1, negativeA1, yn);
233             }
234             if constexpr (OCCUPANCY >> 1 & 1) {
235                 s1 = vmla(s1, b1, xn);
236             }
237             if constexpr (OCCUPANCY >> 2 & 1) {
238                 s2 = vmul(b2, xn);
239             } else {
240                 s2 = vdupn<T>(0.f);
241             }
242             if constexpr (OCCUPANCY >> 4 & 1) {
243                 s2 = vmla(s2, negativeA2, yn);
244             }
245 
246             input += stride;
247             vst1(output, yn);
248             output += stride;
249         }
250         vst1(&delays[0], s1);
251         vst1(&delays[localStride], s2);
252         delays += elements;
253     }
254 }
255 
256 #define BIQUAD_FILTER_CASE(N, ... /* type */) \
257             case N: { \
258                 biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, __VA_ARGS__>( \
259                         out + offset, in + offset, frames, stride, remaining, \
260                         delays + offset, c, localStride); \
261                 goto exit; \
262             }
263 
264 template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
biquad_filter_neon(D * out,const D * in,size_t frames,size_t stride,size_t channelCount,D * delays,const D * coefs,size_t localStride)265 void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride,
266         size_t channelCount, D *delays, const D *coefs, size_t localStride) {
267     if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
268         zeroChannels(out, frames, stride, channelCount);
269         return;
270     }
271 
272     // Possible alternative intrinsic types for 2, 9, 15 float elements.
273     // using alt_2_t = struct {struct { float a; float b; } s; };
274     // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
275     // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
276 
277     for (size_t offset = 0; offset < channelCount; ) {
278         size_t remaining = channelCount - offset;
279         auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
280         if constexpr (std::is_same_v<D, float>) {
281             switch (remaining) {
282             default:
283                 if (remaining >= 16) {
284                     remaining &= ~15;
285                     biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, float32x4x4_t>(
286                             out + offset, in + offset, frames, stride, remaining,
287                             delays + offset, c, localStride);
288                     offset += remaining;
289                     continue;
290                 }
291                 break;  // case 1 handled at bottom.
292             BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
293             BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
294             BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
295             BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
296             BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
297             BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
298             BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
299             // We choose the NEON intrinsic type over internal_array for 8 to
300             // check if there is any performance difference in benchmark (should be similar).
301             // BIQUAD_FILTER_CASE(8, intrinsics::internal_array_t<float, 8>)
302             BIQUAD_FILTER_CASE(8, float32x4x2_t)
303             BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
304             BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
305             BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
306             BIQUAD_FILTER_CASE(4, float32x4_t)
307             // We choose the NEON intrinsic type over internal_array for 4 to
308             // check if there is any performance difference in benchmark (should be similar).
309             // BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<float, 4>)
310             BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
311             BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
312             }
313         } else if constexpr (std::is_same_v<D, double>) {
314 #if defined(__aarch64__)
315             switch (remaining) {
316             default:
317                 if (remaining >= 8) {
318                     remaining &= ~7;
319                     biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL,
320                               intrinsics::internal_array_t<double, 8>>(
321                             out + offset, in + offset, frames, stride, remaining,
322                             delays + offset, c, localStride);
323                     offset += remaining;
324                     continue;
325                 }
326                 break; // case 1 handled at bottom.
327             BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
328             BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
329             BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
330             BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
331             BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
332             BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
333             };
334 #endif
335         }
336         // Essentially the code below is scalar, the same as
337         // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
338         // but formulated with NEON intrinsic-like call pattern.
339         biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, D>(
340                 out + offset, in + offset, frames, stride, remaining,
341                 delays + offset, c, localStride);
342         offset += remaining;
343     }
344     exit:;
345 }
346 
347 #endif // USE_NEON
348 
349 } // namespace details
350 
351 /**
352  * BiquadFilter
353  *
354  * A multichannel Biquad filter implementation of the following transfer function.
355  *
356  * \f[
357  *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
358  *               { 1   + a_1 z^{-1} + a_2 z^{-2} }
359  * \f]
360  *
361  * <!--
362  *        b_0 + b_1 z^{-1} + b_2 z^{-2}
363  *  H(z)= -----------------------------
364  *        1 + a_1 z^{-1} + a_2 z^{-2}
365  * -->
366  *
367  *  Details:
368  *    1. The transposed direct type 2 implementation allows zeros to be computed
369  *       before poles in the internal state for improved filter precision and
370  *       better time-varying coefficient performance.
371  *    2. We optimize for zero coefficients using a compile-time generated function table.
372  *    3. We optimize for vector operations using column vector operations with stride
373  *       into interleaved audio data.
374  *    4. The denominator coefficients a_1 and a_2 are stored in positive form, despite the
375  *       negated form being slightly simpler for optimization (addition is commutative but
376  *       subtraction is not commutative).  This is to permit obtaining the coefficients
377  *       as a const reference.
378  *
379  *       Compatibility issue: Some Biquad libraries store the denominator coefficients
380  *       in negated form.  We explicitly negate before entering into the inner loop.
381  *    5. The full 6 coefficient Biquad filter form with a_0 != 1 may be used for setting
382  *       coefficients.  See setCoefficients() below.
383  *
384  * If SAME_COEFFICIENTS_PER_CHANNEL is false, then mCoefs is stored interleaved by channel.
385  *
386  * The Biquad filter update equation in transposed Direct form 2 is as follows:
387  *
388  * \f{eqnarray*}{
389  * y[n] &=& b0 * x[n] + s1[n - 1] \\
390  * s1[n] &=& s2[n - 1] + b1 * x[n] - a1 * y[n] \\
391  * s2[n] &=& b2 * x[n] - a2 * y[n]
392  * \f}
393  *
394  * For the transposed Direct form 2 update equation s1 and s2 represent the delay state
395  * contained in the internal vector mDelays[].  This is stored interleaved by channel.
396  *
397  * Use -ffast-math` to permit associative math optimizations to get non-zero optimization as
398  * we do not rely on strict C operator precedence and associativity here.
399  * TODO(b/159373530): Use compound statement scoped pragmas instead of `-ffast-math`.
400  *
401  * \param D type variable representing the data type, one of float or double.
402  *         The default is float.
403  * \param SAME_COEF_PER_CHANNEL bool which is true if all the Biquad coefficients
404  *         are shared between channels, or false if the Biquad coefficients
405  *         may differ between channels. The default is true.
406  */
407 template <typename D = float, bool SAME_COEF_PER_CHANNEL = true>
408 class BiquadFilter {
409 public:
410     template <typename T = std::array<D, kBiquadNumCoefs>>
411     explicit BiquadFilter(size_t channelCount,
412             const T& coefs = {}, bool optimized = true)
mChannelCount(channelCount)413             : mChannelCount(channelCount)
414             , mCoefs(kBiquadNumCoefs * (SAME_COEF_PER_CHANNEL ? 1 : mChannelCount))
415             , mDelays(channelCount * kBiquadNumDelays) {
416         setCoefficients(coefs, optimized);
417     }
418 
419     // copy constructors
BiquadFilter(const BiquadFilter<D,SAME_COEF_PER_CHANNEL> & other)420     BiquadFilter(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
421         *this = other;
422     }
423 
BiquadFilter(BiquadFilter<D,SAME_COEF_PER_CHANNEL> && other)424     BiquadFilter(BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
425         *this = std::move(other);
426     }
427 
428     // copy assignment
429     BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
430             const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
431         mChannelCount = other.mChannelCount;
432         mCoefs = other.mCoefs;
433         mDelays = other.mDelays;
434         return *this;
435     }
436 
437     BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
438             BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
439         mChannelCount = other.mChannelCount;
440         mCoefs = std::move(other.mCoefs);
441         mDelays = std::move(other.mDelays);
442         return *this;
443     }
444 
445     // operator overloads for equality tests
446     bool operator==(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
447         return mChannelCount == other.mChannelCount
448                 && mCoefs == other.mCoefs
449                 && mDelays == other.mDelays;
450     }
451 
452     bool operator!=(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
453         return !operator==(other);
454     }
455 
456     /**
457      * \brief Sets filter coefficients
458      *
459      * \param coefs  pointer to the filter coefficients array.
460      * \param optimized whether to use processor optimized function (optional, defaults true).
461      * \return true if the BiquadFilter is stable, otherwise, return false.
462      *
463      * The input coefficients are interpreted in the following manner:
464      *
465      * If size of container is 5 (normalized Biquad):
466      * coefs[0] is b0,
467      * coefs[1] is b1,
468      * coefs[2] is b2,
469      * coefs[3] is a1,
470      * coefs[4] is a2.
471      *
472      * \f[
473      *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
474      *               { 1   + a_1 z^{-1} + a_2 z^{-2} }
475      * \f]
476      * <!--
477      *        b_0 + b_1 z^{-1} + b_2 z^{-2}
478      *  H(z)= -----------------------------
479      *        1 + a_1 z^{-1} + a_2 z^{-2}
480      * -->
481      *
482      * If size of container is 6 (general Biquad):
483      * coefs[0] is b0,
484      * coefs[1] is b1,
485      * coefs[2] is b2,
486      * coefs[3] is a0,
487      * coefs[4] is a1,
488      * coefs[5] is a2.
489      *
490      * \f[
491      *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
492      *               { a_0 + a_1 z^{-1} + a_2 z^{-2} }
493      * \f]
494      * <!--
495      *        b_0 + b_1 z^{-1} + b_2 z^{-2}
496      *  H(z)= -----------------------------
497      *        a_0 + a_1 z^{-1} + a_2 z^{-2}
498      * -->
499      *
500      * The internal representation is a normalized Biquad.
501      */
502     template <typename T = std::array<D, kBiquadNumCoefs>>
503     bool setCoefficients(const T& coefs, bool optimized = true) {
504         if constexpr (SAME_COEF_PER_CHANNEL) {
505             details::setCoefficients<D, T>(
506                     mCoefs, 0 /* offset */, 1 /* stride */, 1 /* channelCount */, coefs);
507         } else {
508             if (coefs.size() == mCoefs.size()) {
509                 std::copy(coefs.begin(), coefs.end(), mCoefs.begin());
510             } else {
511                 details::setCoefficients<D, T>(
512                         mCoefs, 0 /* offset */, mChannelCount, mChannelCount, coefs);
513             }
514         }
515         setOptimization(optimized);
516         return isStable();
517     }
518 
519     /**
520      * Sets coefficients for one of the filter channels, specified by channelIndex.
521      *
522      * This method is only available if SAME_COEF_PER_CHANNEL is false.
523      *
524      * \param coefs the coefficients to set.
525      * \param channelIndex the particular channel index to set.
526      * \param optimized whether to use optimized function (optional, defaults true).
527      */
528     template <typename T = std::array<D, kBiquadNumCoefs>>
529     bool setCoefficients(const T& coefs, size_t channelIndex, bool optimized = true) {
530         static_assert(!SAME_COEF_PER_CHANNEL);
531 
532         details::setCoefficients<D, T>(
533                 mCoefs, channelIndex, mChannelCount, 1 /* channelCount */, coefs);
534         setOptimization(optimized);
535         return isStable();
536     }
537 
538     /**
539      * Returns the coefficients as a const vector reference.
540      *
541      * If multichannel and the template variable SAME_COEF_PER_CHANNEL is true,
542      * the coefficients are interleaved by channel.
543      */
getCoefficients()544     const std::vector<D>& getCoefficients() const {
545         return mCoefs;
546     }
547 
548     /**
549      * Returns true if the filter is stable.
550      *
551      * \param channelIndex ignored if SAME_COEF_PER_CHANNEL is true,
552      *        asserts if channelIndex >= channel count (zero based index).
553      */
554     bool isStable(size_t channelIndex = 0) const {
555         if constexpr (SAME_COEF_PER_CHANNEL) {
556             return details::isStable(mCoefs[3], mCoefs[4]);
557         } else {
558             assert(channelIndex < mChannelCount);
559             return details::isStable(
560                     mCoefs[3 * mChannelCount + channelIndex],
561                     mCoefs[4 * mChannelCount + channelIndex]);
562         }
563     }
564 
565     /**
566      * Updates the filter function based on processor optimization.
567      *
568      * \param optimized if true, enables Processor based optimization.
569      */
setOptimization(bool optimized)570     void setOptimization(bool optimized) {
571         // Determine which coefficients are nonzero as a bit field.
572         size_t category = 0;
573         for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
574             if constexpr (SAME_COEF_PER_CHANNEL) {
575                 category |= (mCoefs[i] != 0) << i;
576             } else {
577                 for (size_t j = 0; j < mChannelCount; ++j) {
578                     if (mCoefs[i * mChannelCount + j] != 0) {
579                         category |= 1 << i;
580                         break;
581                     }
582                 }
583             }
584         }
585 
586         // Select the proper filtering function from our array.
587         (void)optimized;                // avoid unused variable warning.
588         mFunc = mFilterFast[category];  // default if we don't have processor optimization.
589 
590 #ifdef USE_NEON
591         /* if constexpr (std::is_same_v<D, float>) */ {
592             if (optimized) {
593                 mFunc = mFilterNeon[category];
594             }
595         }
596 #endif
597     }
598 
599     /**
600      * \brief Filters the input data
601      *
602      * \param out     pointer to the output data
603      * \param in      pointer to the input data
604      * \param frames  number of audio frames to be processed
605      */
process(D * out,const D * in,size_t frames)606     void process(D* out, const D *in, size_t frames) {
607         process(out, in, frames, mChannelCount);
608     }
609 
610     /**
611      * \brief Filters the input data with stride
612      *
613      * \param out     pointer to the output data
614      * \param in      pointer to the input data
615      * \param frames  number of audio frames to be processed
616      * \param stride  the total number of samples associated with a frame, if not channelCount.
617      */
process(D * out,const D * in,size_t frames,size_t stride)618     void process(D* out, const D *in, size_t frames, size_t stride) {
619         assert(stride >= mChannelCount);
620         mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
621                 mCoefs.data(), mChannelCount);
622     }
623 
624     /**
625      * EXPERIMENTAL:
626      * Processes 1D input data, with mChannel Biquads, using sliding window parallelism.
627      *
628      * Instead of considering mChannel Biquads as one-per-input channel, this method treats
629      * the mChannel biquads as applied in sequence to a single 1D input stream,
630      * with the last channel count Biquad being applied first.
631      *
632      * input audio data -> BQ_{n-1} -> BQ{n-2} -> BQ_{n-3} -> BQ_{0} -> output
633      *
634      * TODO: Make this code efficient for NEON and split the destination from the source.
635      *
636      * Theoretically this code should be much faster for 1D input if one has 4+ Biquads to be
637      * sequentially applied, but in practice it is *MUCH* slower.
638      * On NEON, the data cannot be written then read in-place without incurring
639      * memory stall penalties.  A shifting NEON holding register is required to make this
640      * a practical improvement.
641      */
process1D(D * inout,size_t frames)642     void process1D(D* inout, size_t frames) {
643         size_t remaining = mChannelCount;
644 #ifdef USE_NEON
645         // We apply NEON acceleration striped with 4 filters (channels) at once.
646         // Filters operations commute, nevertheless we apply the filters in order.
647         if (frames >= 2 * mChannelCount) {
648             constexpr size_t channelBlock = 4;
649             for (; remaining >= channelBlock; remaining -= channelBlock) {
650                 const size_t baseIdx = remaining - channelBlock;
651                 // This is the 1D accelerated method.
652                 // prime the data pipe.
653                 for (size_t i = 0; i < channelBlock - 1; ++i) {
654                     size_t fromEnd = remaining - i - 1;
655                     auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
656                     auto delays = mDelays.data() + fromEnd;
657                     mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
658                             delays, coefs, mChannelCount);
659                 }
660 
661                 auto delays = mDelays.data() + baseIdx;
662                 auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : baseIdx);
663                 // Parallel processing - we use a sliding window doing channelBlock at once,
664                 // sliding one audio sample at a time.
665                 mFunc(inout, inout,
666                         frames - channelBlock + 1, 1 /* stride */, channelBlock,
667                         delays, coefs, mChannelCount);
668 
669                 // drain data pipe.
670                 for (size_t i = 1; i < channelBlock; ++i) {
671                     mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
672                             1 /* frames */, 1 /* stride */, channelBlock - i,
673                             delays, coefs, mChannelCount);
674                 }
675             }
676         }
677 #endif
678         // For short data sequences, we use the serial single channel logical equivalent
679         for (; remaining > 0; --remaining) {
680             size_t fromEnd = remaining - 1;
681             auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
682             mFunc(inout, inout,
683                     frames, 1 /* stride */, 1 /* channelCount */,
684                     mDelays.data() + fromEnd, coefs, mChannelCount);
685         }
686     }
687 
688     /**
689      * \brief Clears the delay elements
690      *
691      * This function clears the delay elements representing the filter state.
692      */
clear()693     void clear() {
694         std::fill(mDelays.begin(), mDelays.end(), 0.f);
695     }
696 
697     /**
698      * \brief Sets the internal delays from a vector
699      *
700      * For a multichannel stream, the delays are interleaved by channel:
701      * delays[2 * i + 0] is s1 of i-th channel,
702      * delays[2 * i + 1] is s2 of i-th channel,
703      * where index i runs from 0 to (mChannelCount - 1).
704      *
705      * \param delays reference to vector containing delays.
706      */
setDelays(std::vector<D> & delays)707     void setDelays(std::vector<D>& delays) {
708         assert(delays.size() == mDelays.size());
709         mDelays = std::move(delays);
710     }
711 
712     /**
713      * \brief Gets delay elements as a vector
714      *
715      * For a multichannel stream, the delays are interleaved by channel:
716      * delays[2 * i + 0] is s1 of i-th channel,
717      * delays[2 * i + 1] is s2 of i-th channel,
718      * where index i runs from 0 to (mChannelCount - 1).
719      *
720      * \return a const vector reference of delays.
721      */
getDelays()722     const std::vector<D>& getDelays() const {
723         return mDelays;
724     }
725 
726 private:
727     /* const */ size_t mChannelCount; // not const because we can assign to it on operator equals.
728 
729     /*
730      * \var D mCoefs
731      * \brief Stores the filter coefficients
732      *
733      * If SAME_COEF_PER_CHANNEL is false, the filter coefficients are stored
734      * interleaved by channel.
735      */
736     std::vector<D> mCoefs;
737 
738     /**
739      * \var D mDelays
740      * \brief The delay state.
741      *
742      * The delays are stored channel interleaved in the following manner,
743      * mDelays[2 * i + 0] is s1 of i-th channel
744      * mDelays[2 * i + 1] is s2 of i-th channel
745      * index i runs from 0 to (mChannelCount - 1).
746      */
747     std::vector<D> mDelays;
748 
749     using filter_func = decltype(details::biquad_filter_fast<0, true, D>);
750 
751     /**
752      * \var filter_func* mFunc
753      *
754      * The current filter function selected for the channel occupancy of the Biquad.
755      */
756     filter_func *mFunc;
757 
758     // Create a functional wrapper to feed "biquad_filter_fast" to
759     // make_functional_array() to populate the array.
760     //
761     // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
762     // b0 b1 b2 a1 a2  (from lsb to msb)
763     template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
764     struct FuncWrap {
765         template<typename T>
nearestFuncWrap766         static constexpr size_t nearest() {
767             // Combine cases to both improve expected performance and reduce code space.
768             // Some occupancy masks provide worse performance than more occupied masks.
769             constexpr size_t required_occupancies[] = {
770                 1,  // constant scale
771                 3,  // single zero
772                 7,  // double zero
773                 9,  // single pole
774                 // 11, // first order IIR (unnecessary optimization, close enough to 31).
775                 27, // double pole + single zero
776                 31, // second order IIR (full Biquad)
777             };
778             if constexpr (OCCUPANCY < 32) {
779                 for (auto test : required_occupancies) {
780                     if ((OCCUPANCY & test) == OCCUPANCY) return test;
781                 }
782             } else {
783                 static_assert(intrinsics::dependent_false_v<T>);
784             }
785             return 0; // never gets here.
786         }
787 
funcFuncWrap788         static void func(D* out, const D *in, size_t frames, size_t stride,
789                 size_t channelCount, D *delays, const D *coef, size_t localStride) {
790             constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
791             details::biquad_filter_fast<NEAREST_OCCUPANCY, SC>(
792                     out, in, frames, stride, channelCount, delays, coef, localStride);
793         }
794     };
795 
796     /**
797      * \var mFilterFast
798      *
799      * std::array of functions based on coefficient occupancy.
800      *
801      *  static inline constexpr std::array<filter_func*, M> mArray = {
802      *     biquad_filter_fast<0>,
803      *     biquad_filter_fast<1>,
804      *     biquad_filter_fast<2>,
805      *      ...
806      *     biquad_filter_fast<(1 << kBiquadNumCoefs) - 1>,
807      *  };
808      *
809      * Every time the coefficients are changed, we select the processing function from
810      * this table.
811      */
812     static inline constexpr auto mFilterFast =
813             details::make_functional_array<
814                     FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
815 
816 #ifdef USE_NEON
817     // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
818     // b0 b1 b2 a1 a2  (from lsb to msb)
819 
820     template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
821     struct FuncWrapNeon {
822         template<typename T>
nearestFuncWrapNeon823         static constexpr size_t nearest() {
824             // combine cases to both improve expected performance and reduce code space.
825             //
826             // This lists the occupancies we will specialize functions for.
827             constexpr size_t required_occupancies[] = {
828                 1,  // constant scale
829                 3,  // single zero
830                 7,  // double zero
831                 9,  // single pole
832                 11, // first order IIR
833                 27, // double pole + single zero
834                 31, // second order IIR (full Biquad)
835             };
836             if constexpr (OCCUPANCY < 32) {
837                 for (auto test : required_occupancies) {
838                     if ((OCCUPANCY & test) == OCCUPANCY) return test;
839                 }
840             } else {
841                 static_assert(intrinsics::dependent_false_v<T>);
842             }
843             return 0; // never gets here.
844         }
845 
funcFuncWrapNeon846         static void func(D* out, const D *in, size_t frames, size_t stride,
847                 size_t channelCount, D *delays, const D *coef, size_t localStride) {
848             constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
849             details::biquad_filter_neon<NEAREST_OCCUPANCY, SC>(
850                     out, in, frames, stride, channelCount, delays, coef, localStride);
851         }
852     };
853 
854     // Neon optimized array of functions.
855     static inline constexpr auto mFilterNeon =
856             details::make_functional_array<
857                     FuncWrapNeon, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
858 #endif // USE_NEON
859 
860 };
861 
862 } // namespace android::audio_utils
863 
864 #pragma pop_macro("USE_NEON")
865 
866 #endif  // !ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
867