android-12.0.0_r2/s

/*
 * Copyright 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
#define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H

#include <array>  // std::size
#include <type_traits>

/*
  The intrinsics utility library contain helper functions for wide width DSP support.
  We use templated types to allow testing from scalar to vector values.

  See the Eigen project for general abstracted linear algebra acceleration.
  http://eigen.tuxfamily.org/
*/

// We conditionally include neon optimizations for ARM devices
#pragma push_macro("USE_NEON")
#undef USE_NEON

#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif

namespace android::audio_utils::intrinsics {

// For static assert(false) we need a template version to avoid early failure.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <typename T>
inline constexpr bool dependent_false_v = false;

// Type of array embedded in a struct that is usable in the Neon template functions below.
// This type must satisfy std::is_array_v<>.
template<typename T, size_t N>
struct internal_array_t {
    T v[N];
};

/*
  Generalized template functions for the Neon instruction set.

  See here for some general comments from ARM.
  https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization

  Notes:
  1) We provide scalar equivalents which are compilable even on non-ARM processors.
  2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
  3) NEON double SIMD acceleration is only available on 64 bit architectures.
     On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.

  We create a generic Neon acceleration to be applied to a composite type.

  The type follows the following compositional rules for simplicity:
      1) must be a primitive floating point type.
      2) must be a NEON data type.
      3) must be a struct with one member, either
           a) an array of types 1-3.
           b) a cons-pair struct of 2 possibly different members of types 1-3.

  Examples of possible struct definitions:
  using alternative_2_t = struct { struct { float a; float b; } s; };
  using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
  using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
*/

// duplicate float into all elements.
template<typename T, typename F>
static inline T vdupn(F f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (auto& val : retval) {
                val = vdupn<std::decay_t<decltype(val)>>(f);
            }
            return ret;
        } else /* constexpr */ {
             auto &[r1, r2] = retval;
             using r1_type = std::decay_t<decltype(r1)>;
             using r2_type = std::decay_t<decltype(r2)>;
             r1 = vdupn<r1_type>(f);
             r2 = vdupn<r2_type>(f);
             return ret;
        }
    }
}

// load from float pointer.
template<typename T, typename F>
static inline T vld1(const F *f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
            using element_type = std::decay_t<decltype(retval[0])>;
            constexpr size_t subelements = sizeof(element_type) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(retval); ++i) {
                retval[i] = vld1<element_type>(f);
                f += subelements;
            }
            return ret;
        } else /* constexpr */ {
             auto &[r1, r2] = retval;
             using r1_type = std::decay_t<decltype(r1)>;
             using r2_type = std::decay_t<decltype(r2)>;
             r1 = vld1<r1_type>(f);
             f += sizeof(r1) / sizeof(F);
             r2 = vld1<r2_type>(f);
             return ret;
        }
    }
}

// fused multiply-add a + b * c
template<typename T>
static inline T vmla(T a, T b, T c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a + b * c;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmla_f32(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmlaq_f32(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmlaq_f64(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        const auto &[cval] = c;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vmla(aval[i], bval[i], cval[i]);
            }
            return ret;
        } else /* constexpr */ {
             auto &[r1, r2] = retval;
             const auto &[a1, a2] = aval;
             const auto &[b1, b2] = bval;
             const auto &[c1, c2] = cval;
             r1 = vmla(a1, b1, c1);
             r2 = vmla(a2, b2, c2);
             return ret;
        }
    }
}

// multiply a * b
template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[aval] = a;
        const auto &[bval] = b;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vmul(aval[i], bval[i]);
            }
            return ret;
        } else /* constexpr */ {
             auto &[r1, r2] = retval;
             const auto &[a1, a2] = aval;
             const auto &[b1, b2] = bval;
             r1 = vmul(a1, b1);
             r2 = vmul(a2, b2);
             return ret;
        }
    }
}

// negate
template<typename T>
static inline T vneg(T f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return -f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vneg_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vnegq_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vnegq_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret;  // single-member struct
        const auto &[fval] = f;
        if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
            for (size_t i = 0; i < std::size(fval); ++i) {
                retval[i] = vneg(fval[i]);
            }
            return ret;
        } else /* constexpr */ {
             auto &[r1, r2] = retval;
             const auto &[f1, f2] = fval;
             r1 = vneg(f1);
             r2 = vneg(f2);
             return ret;
        }
    }
}

// store to float pointer.
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        const auto &[aval] = a;
        if constexpr (std::is_array_v<decltype(aval)>) {
            constexpr size_t subelements = sizeof(std::decay_t<decltype(aval[0])>) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                vst1(f, aval[i]);
                f += subelements;
            }
        } else /* constexpr */ {
             const auto &[a1, a2] = aval;
             vst1(f, a1);
             f += sizeof(std::decay_t<decltype(a1)>) / sizeof(F);
             vst1(f, a2);
        }
    }
}

} // namespace android::audio_utils::intrinsics

#pragma pop_macro("USE_NEON")

#endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H