/external/XNNPACK/src/math/ |
D | sqrt-neonfma-nr3fma.c | 29 float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf); in xnn_math_f32_sqrt__neonfma_nr3fma() local 35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 36 vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 40 vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma() 43 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr3fma()
|
D | sqrt-neonfma-nr2fma1adj.c | 29 float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() local 35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 36 vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 40 vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma1adj() 47 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr2fma1adj()
|
D | sqrt-neonfma-nr2fma.c | 29 float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf); in xnn_math_f32_sqrt__neonfma_nr2fma() local 35 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma() 36 vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma() 39 vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr2fma()
|
D | sqrt-avx512f-nr2fma.c | 30 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma() local 36 __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma() 37 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__avx512f_nr2fma() 40 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma()
|
D | sqrt-fma3-nr2fma.c | 30 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma() local 36 __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma() 37 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__fma3_nr2fma() 40 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma()
|
D | sqrt-avx512f-nr1fma1adj.c | 30 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() local 36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() 37 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() 44 vsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
|
D | sqrt-fma3-nr1fma1adj.c | 30 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr1fma1adj() local 36 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr1fma1adj() 37 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj() 44 vsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_math_f32_sqrt__fma3_nr1fma1adj()
|
D | sqrt-neonfma-nr1rsqrts1fma1adj.c | 34 float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() local 40 float32x4_t vresidual = vfmsq_f32(vhalf, vsqrtx, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 41 vhalfrsqrtx = vfmaq_f32(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 48 vsqrtx = vfmaq_f32(vsqrtx, vhalfrsqrtx, vadjustment); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
|
/external/XNNPACK/src/f32-vsqrt/gen/ |
D | avx512f-nr1fma1adj-x16.c | 35 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local 36 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 37 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 40 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 55 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local 56 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 57 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 60 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
|
D | fma3-nr1fma1adj-x8.c | 36 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local 37 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 38 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 41 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 55 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local 56 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 57 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 60 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
|
D | avx512f-nr1fma1adj-x32.c | 66 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local 67 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 68 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 71 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 86 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local 87 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 88 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 91 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
|
D | fma3-nr1fma1adj-x16.c | 67 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local 68 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 69 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 72 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 86 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local 87 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 88 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 91 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
|
D | fma3-nr1fma1adj-x24.c | 77 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local 78 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 79 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 82 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 96 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local 97 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 98 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 101 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
|
D | avx512f-nr1fma1adj-x48.c | 76 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local 77 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 78 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 81 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 96 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local 97 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 98 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 101 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
|
D | avx512f-nr1fma1adj-x64.c | 86 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local 87 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 88 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 91 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 106 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local 107 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 108 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 111 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
|
D | fma3-nr1fma1adj-x32.c | 87 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local 88 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 89 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 92 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 106 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local 107 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 108 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 111 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
|
D | avx512f-nr1fma1adj-x80.c | 96 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() local 97 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 98 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 101 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 116 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() local 117 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 118 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 121 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
|
D | fma3-nr1fma1adj-x40.c | 97 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() local 98 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 99 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 102 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 116 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() local 117 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 118 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 121 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
|
D | avx512f-nr1fma1adj-x96.c | 106 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() local 107 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 108 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 111 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 126 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() local 127 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 128 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 131 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
|
D | fma3-nr1fma1adj-x48.c | 107 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() local 108 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 109 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 112 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 126 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() local 127 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 128 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 131 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
|
D | avx512f-nr1fma1adj-x112.c | 116 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() local 117 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 118 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 121 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 136 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() local 137 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 138 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 141 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
|
D | fma3-nr1fma1adj-x56.c | 117 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() local 118 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 119 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 122 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 136 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() local 137 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 138 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 141 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
|
/external/XNNPACK/src/f32-vsqrt/ |
D | avx512f-nr1fma1adj.c.in | 41 __m512 vhalfrsqrtx${ABC[N]} = _mm512_mul_ps(vrsqrtx${ABC[N]}, vhalf); 44 … const __m512 vresidual${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf); 47 …vhalfrsqrtx${ABC[N]} = _mm512_fmadd_ps(vhalfrsqrtx${ABC[N]}, vresidual${ABC[N]}, vhalfrsqrtx${ABC[… 54 …const __m512 vy${ABC[N]} = _mm512_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC… 67 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); variable 68 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); 69 vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); 72 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); 87 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); variable 88 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); [all …]
|
D | fma3-nr1fma1adj.c.in | 42 __m256 vhalfrsqrtx${ABC[N]} = _mm256_mul_ps(vrsqrtx${ABC[N]}, vhalf); 45 … const __m256 vresidual${ABC[N]} = _mm256_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf); 48 …vhalfrsqrtx${ABC[N]} = _mm256_fmadd_ps(vhalfrsqrtx${ABC[N]}, vresidual${ABC[N]}, vhalfrsqrtx${ABC[… 55 …const __m256 vy${ABC[N]} = _mm256_fmadd_ps(vhalfrsqrtx${ABC[N]}, vadjustment${ABC[N]}, vsqrtx${ABC… 68 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); variable 69 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); 70 vhalfrsqrtx = _mm256_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx); 73 const __m256 vy = _mm256_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); 87 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); variable 88 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); [all …]
|
D | neonfma-nr2fma1adj.c.in | 38 float32x4_t vhalfrsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vhalf); 41 …float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]… 44 …vhalfrsqrtx${ABC[N:N+4]} = vfmaq_f32(vhalfrsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vhalfrsqrtx… 48 vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}); 51 …vhalfrsqrtx${ABC[N:N+4]} = vfmaq_f32(vhalfrsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vhalfrsqrtx… 58 …const float32x4_t vy${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}, vadju…
|