/*
 * Vector math abstractions.
 *
 * Copyright (c) 2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */
#ifndef _V_MATH_H
#define _V_MATH_H

#ifndef WANT_VMATH
/* Enable the build of vector math code.  */
# define WANT_VMATH 1
#endif
#if WANT_VMATH

/* The goal of this header is to allow a vector and a scalar build of the same
   algorithm.  The provided intrinsic wrappers are also vector length agnostic,
   so they can be implemented for SVE (or other SIMD architectures) as well,
   and the code should then work on those targets too.  */

#if SCALAR
#define V_NAME(x) __s_##x
#elif VPCS && __aarch64__
#define V_NAME(x) __vn_##x
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
#else
#define V_NAME(x) __v_##x
#endif

#ifndef VPCS_ATTR
#define VPCS_ATTR
#endif
#ifndef VPCS_ALIAS
#define VPCS_ALIAS
#endif

#include <stdint.h>
#include "math_config.h"

typedef float f32_t;
typedef uint32_t u32_t;
typedef int32_t s32_t;
typedef double f64_t;
typedef uint64_t u64_t;
typedef int64_t s64_t;

/* reinterpret as type1 from type2.  */
static inline u32_t as_u32_f32 (f32_t x)
{ union { f32_t f; u32_t u; } r = {x}; return r.u; }
static inline f32_t as_f32_u32 (u32_t x)
{ union { u32_t u; f32_t f; } r = {x}; return r.f; }
static inline s32_t as_s32_u32 (u32_t x)
{ union { u32_t u; s32_t i; } r = {x}; return r.i; }
static inline u32_t as_u32_s32 (s32_t x)
{ union { s32_t i; u32_t u; } r = {x}; return r.u; }
static inline u64_t as_u64_f64 (f64_t x)
{ union { f64_t f; u64_t u; } r = {x}; return r.u; }
static inline f64_t as_f64_u64 (u64_t x)
{ union { u64_t u; f64_t f; } r = {x}; return r.f; }
static inline s64_t as_s64_u64 (u64_t x)
{ union { u64_t u; s64_t i; } r = {x}; return r.i; }
static inline u64_t as_u64_s64 (s64_t x)
{ union { s64_t i; u64_t u; } r = {x}; return r.u; }
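/* Illustrative sketch, not part of the original header: a routine written
   against these wrappers is implemented once and gets the __s_*, __v_* or
   __vn_* name depending on how the file is built.  The routine name scale2f
   and the file name are made up for the example; in a hypothetical
   v_scale2f.c one would write:

     #include "v_math.h"
     #if V_SUPPORTED

     VPCS_ATTR
     v_f32_t
     V_NAME (scale2f) (v_f32_t x)
     {
       // 2*x + 1 via the lane-agnostic wrappers: the same source serves the
       // scalar build (one lane) and the AdvSIMD build (four lanes).
       return v_fma_f32 (x, v_f32 (2.0f), v_f32 (1.0f));
     }
     VPCS_ALIAS

     #endif
*/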
#if SCALAR
#define V_SUPPORTED 1
typedef f32_t v_f32_t;
typedef u32_t v_u32_t;
typedef s32_t v_s32_t;
typedef f64_t v_f64_t;
typedef u64_t v_u64_t;
typedef s64_t v_s64_t;

static inline int v_lanes32 (void) { return 1; }

static inline v_f32_t v_f32 (f32_t x) { return x; }
static inline v_u32_t v_u32 (u32_t x) { return x; }
static inline v_s32_t v_s32 (s32_t x) { return x; }

static inline f32_t v_get_f32 (v_f32_t x, int i) { return x; }
static inline u32_t v_get_u32 (v_u32_t x, int i) { return x; }
static inline s32_t v_get_s32 (v_s32_t x, int i) { return x; }

static inline void v_set_f32 (v_f32_t *x, int i, f32_t v) { *x = v; }
static inline void v_set_u32 (v_u32_t *x, int i, u32_t v) { *x = v; }
static inline void v_set_s32 (v_s32_t *x, int i, s32_t v) { *x = v; }

/* true if any element of a v_cond result is non-zero.  */
static inline int v_any_u32 (v_u32_t x) { return x != 0; }
/* to wrap the result of relational operators.  */
static inline v_u32_t v_cond_u32 (v_u32_t x) { return x ? -1 : 0; }

static inline v_f32_t v_abs_f32 (v_f32_t x) { return __builtin_fabsf (x); }
static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{ return __builtin_fmaf (x, y, z); }
static inline v_f32_t v_round_f32 (v_f32_t x) { return __builtin_roundf (x); }
static inline v_s32_t v_round_s32 (v_f32_t x)
{ return __builtin_lroundf (x); /* relies on -fno-math-errno.  */ }

/* convert to type1 from type2.  */
static inline v_f32_t v_to_f32_s32 (v_s32_t x) { return x; }
static inline v_f32_t v_to_f32_u32 (v_u32_t x) { return x; }

/* reinterpret as type1 from type2.  */
static inline v_u32_t v_as_u32_f32 (v_f32_t x)
{ union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; }
static inline v_f32_t v_as_f32_u32 (v_u32_t x)
{ union { v_u32_t u; v_f32_t f; } r = {x}; return r.f; }
static inline v_s32_t v_as_s32_u32 (v_u32_t x)
{ union { v_u32_t u; v_s32_t i; } r = {x}; return r.i; }
static inline v_u32_t v_as_u32_s32 (v_s32_t x)
{ union { v_s32_t i; v_u32_t u; } r = {x}; return r.u; }

static inline v_f32_t v_lookup_f32 (const f32_t *tab, v_u32_t idx)
{ return tab[idx]; }
static inline v_u32_t v_lookup_u32 (const u32_t *tab, v_u32_t idx)
{ return tab[idx]; }

static inline v_f32_t v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y,
                                  v_u32_t p)
{ return f (x); }
static inline v_f32_t v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1,
                                   v_f32_t x2, v_f32_t y, v_u32_t p)
{ return f (x1, x2); }
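/* Illustrative sketch, not from the original header: v_cond_*, v_any_* and
   v_call_* are meant to be combined so special cases are handed to a scalar
   routine lane by lane while the fast path stays vectorised.  Bound and
   fallback_f32 below are hypothetical names:

     v_f32_t y = ...;  // fast-path result for all lanes
     v_u32_t cmp = v_cond_u32 (v_as_u32_f32 (v_abs_f32 (x)) > v_u32 (Bound));
     if (v_any_u32 (cmp))
       // recompute only the flagged lanes with the scalar fallback
       return v_call_f32 (fallback_f32, x, y, cmp);
     return y;
*/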
static inline int v_lanes64 (void) { return 1; }

static inline v_f64_t v_f64 (f64_t x) { return x; }
static inline v_u64_t v_u64 (u64_t x) { return x; }
static inline v_s64_t v_s64 (s64_t x) { return x; }

static inline f64_t v_get_f64 (v_f64_t x, int i) { return x; }
static inline void v_set_f64 (v_f64_t *x, int i, f64_t v) { *x = v; }

/* true if any element of a v_cond result is non-zero.  */
static inline int v_any_u64 (v_u64_t x) { return x != 0; }
/* to wrap the result of relational operators.  */
static inline v_u64_t v_cond_u64 (v_u64_t x) { return x ? -1 : 0; }

static inline v_f64_t v_abs_f64 (v_f64_t x) { return __builtin_fabs (x); }
static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
{ return __builtin_fma (x, y, z); }
static inline v_f64_t v_round_f64 (v_f64_t x) { return __builtin_round (x); }
static inline v_s64_t v_round_s64 (v_f64_t x)
{ return __builtin_lround (x); /* relies on -fno-math-errno.  */ }

/* convert to type1 from type2.  */
static inline v_f64_t v_to_f64_s64 (v_s64_t x) { return x; }
static inline v_f64_t v_to_f64_u64 (v_u64_t x) { return x; }

/* reinterpret as type1 from type2.  */
static inline v_u64_t v_as_u64_f64 (v_f64_t x)
{ union { v_f64_t f; v_u64_t u; } r = {x}; return r.u; }
static inline v_f64_t v_as_f64_u64 (v_u64_t x)
{ union { v_u64_t u; v_f64_t f; } r = {x}; return r.f; }
static inline v_s64_t v_as_s64_u64 (v_u64_t x)
{ union { v_u64_t u; v_s64_t i; } r = {x}; return r.i; }
static inline v_u64_t v_as_u64_s64 (v_s64_t x)
{ union { v_s64_t i; v_u64_t u; } r = {x}; return r.u; }

static inline v_f64_t v_lookup_f64 (const f64_t *tab, v_u64_t idx)
{ return tab[idx]; }
static inline v_u64_t v_lookup_u64 (const u64_t *tab, v_u64_t idx)
{ return tab[idx]; }

static inline v_f64_t v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y,
                                  v_u64_t p)
{ return f (x); }

#elif __aarch64__
#define V_SUPPORTED 1
#include <arm_neon.h>

typedef float32x4_t v_f32_t;
typedef uint32x4_t v_u32_t;
typedef int32x4_t v_s32_t;
typedef float64x2_t v_f64_t;
typedef uint64x2_t v_u64_t;
typedef int64x2_t v_s64_t;

static inline int v_lanes32 (void) { return 4; }

static inline v_f32_t v_f32 (f32_t x) { return (v_f32_t){x, x, x, x}; }
static inline v_u32_t v_u32 (u32_t x) { return (v_u32_t){x, x, x, x}; }
static inline v_s32_t v_s32 (s32_t x) { return (v_s32_t){x, x, x, x}; }

static inline f32_t v_get_f32 (v_f32_t x, int i) { return x[i]; }
static inline u32_t v_get_u32 (v_u32_t x, int i) { return x[i]; }
static inline s32_t v_get_s32 (v_s32_t x, int i) { return x[i]; }

static inline void v_set_f32 (v_f32_t *x, int i, f32_t v) { (*x)[i] = v; }
static inline void v_set_u32 (v_u32_t *x, int i, u32_t v) { (*x)[i] = v; }
static inline void v_set_s32 (v_s32_t *x, int i, s32_t v) { (*x)[i] = v; }

/* true if any element of a v_cond result is non-zero.  */
static inline int v_any_u32 (v_u32_t x)
{
  /* assume elements in x are either 0 or -1u.  */
  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
/* to wrap the result of relational operators.  */
static inline v_u32_t v_cond_u32 (v_u32_t x) { return x; }

static inline v_f32_t v_abs_f32 (v_f32_t x) { return vabsq_f32 (x); }
static inline v_f32_t v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{ return vfmaq_f32 (z, x, y); }
static inline v_f32_t v_round_f32 (v_f32_t x) { return vrndaq_f32 (x); }
static inline v_s32_t v_round_s32 (v_f32_t x) { return vcvtaq_s32_f32 (x); }

/* convert to type1 from type2.  */
static inline v_f32_t v_to_f32_s32 (v_s32_t x)
{ return (v_f32_t){x[0], x[1], x[2], x[3]}; }
static inline v_f32_t v_to_f32_u32 (v_u32_t x)
{ return (v_f32_t){x[0], x[1], x[2], x[3]}; }

/* reinterpret as type1 from type2.  */
static inline v_u32_t v_as_u32_f32 (v_f32_t x)
{ union { v_f32_t f; v_u32_t u; } r = {x}; return r.u; }
static inline v_f32_t v_as_f32_u32 (v_u32_t x)
{ union { v_u32_t u; v_f32_t f; } r = {x}; return r.f; }
static inline v_s32_t v_as_s32_u32 (v_u32_t x)
{ union { v_u32_t u; v_s32_t i; } r = {x}; return r.i; }
static inline v_u32_t v_as_u32_s32 (v_s32_t x)
{ union { v_s32_t i; v_u32_t u; } r = {x}; return r.u; }

static inline v_f32_t v_lookup_f32 (const f32_t *tab, v_u32_t idx)
{ return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; }
static inline v_u32_t v_lookup_u32 (const u32_t *tab, v_u32_t idx)
{ return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; }

static inline v_f32_t v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y,
                                  v_u32_t p)
{
  return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
                   p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
}
static inline v_f32_t v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1,
                                   v_f32_t x2, v_f32_t y, v_u32_t p)
{
  return (v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0],
                   p[1] ? f (x1[1], x2[1]) : y[1],
                   p[2] ? f (x1[2], x2[2]) : y[2],
                   p[3] ? f (x1[3], x2[3]) : y[3]};
}

static inline int v_lanes64 (void) { return 2; }

static inline v_f64_t v_f64 (f64_t x) { return (v_f64_t){x, x}; }
static inline v_u64_t v_u64 (u64_t x) { return (v_u64_t){x, x}; }
static inline v_s64_t v_s64 (s64_t x) { return (v_s64_t){x, x}; }

static inline f64_t v_get_f64 (v_f64_t x, int i) { return x[i]; }
static inline void v_set_f64 (v_f64_t *x, int i, f64_t v) { (*x)[i] = v; }

/* true if any element of a v_cond result is non-zero.  */
static inline int v_any_u64 (v_u64_t x)
{
  /* assume elements in x are either 0 or -1u.  */
  return vpaddd_u64 (x) != 0;
}
/* to wrap the result of relational operators.  */
static inline v_u64_t v_cond_u64 (v_u64_t x) { return x; }

static inline v_f64_t v_abs_f64 (v_f64_t x) { return vabsq_f64 (x); }
static inline v_f64_t v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
{ return vfmaq_f64 (z, x, y); }
static inline v_f64_t v_round_f64 (v_f64_t x) { return vrndaq_f64 (x); }
static inline v_s64_t v_round_s64 (v_f64_t x) { return vcvtaq_s64_f64 (x); }

/* convert to type1 from type2.  */
static inline v_f64_t v_to_f64_s64 (v_s64_t x) { return (v_f64_t){x[0], x[1]}; }
static inline v_f64_t v_to_f64_u64 (v_u64_t x) { return (v_f64_t){x[0], x[1]}; }

/* reinterpret as type1 from type2.  */
static inline v_u64_t v_as_u64_f64 (v_f64_t x)
{ union { v_f64_t f; v_u64_t u; } r = {x}; return r.u; }
static inline v_f64_t v_as_f64_u64 (v_u64_t x)
{ union { v_u64_t u; v_f64_t f; } r = {x}; return r.f; }
static inline v_s64_t v_as_s64_u64 (v_u64_t x)
{ union { v_u64_t u; v_s64_t i; } r = {x}; return r.i; }
static inline v_u64_t v_as_u64_s64 (v_s64_t x)
{ union { v_s64_t i; v_u64_t u; } r = {x}; return r.u; }

static inline v_f64_t v_lookup_f64 (const f64_t *tab, v_u64_t idx)
{ return (v_f64_t){tab[idx[0]], tab[idx[1]]}; }
static inline v_u64_t v_lookup_u64 (const u64_t *tab, v_u64_t idx)
{ return (v_u64_t){tab[idx[0]], tab[idx[1]]}; }

static inline v_f64_t v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y,
                                  v_u64_t p)
{ return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; }
#endif

#endif
#endif