Home
last modified time | relevance | path

Searched refs:vhalf (Results 1 – 25 of 63) sorted by relevance

123

/external/XNNPACK/src/f32-vsqrt/gen/
Dneonfma-nr2fma1adj-x40.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() local
53 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
55 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
57 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
59 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
61 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
63 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
65 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
67 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
69 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40()
[all …]
Dneonfma-nr2fma1adj-x36.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() local
51 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
53 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
55 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
57 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
59 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
61 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
63 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
65 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
67 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36()
[all …]
Dneonfma-nr2fma1adj-x32.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() local
49 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
51 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
53 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
55 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
57 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
59 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
61 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
63 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
65 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32()
[all …]
Dneonfma-nr2fma1adj-x28.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() local
47 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
49 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
51 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
53 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
55 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
57 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
59 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
61 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
62 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28()
[all …]
Dneonfma-nr2fma1adj-x24.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() local
45 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
47 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
49 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
51 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
53 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
55 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
57 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
58 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
59 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24()
[all …]
Davx512f-nr1fma1adj-x128.c28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() local
50 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
52 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
54 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
56 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
58 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
60 __m512 vhalfrsqrtx5 = _mm512_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
62 __m512 vhalfrsqrtx6 = _mm512_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
64 __m512 vhalfrsqrtx7 = _mm512_mul_ps(vrsqrtx7, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
66 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
[all …]
Dneonfma-nr2fma1adj-x20.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() local
43 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
45 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
47 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
49 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
51 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
53 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
54 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
55 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
56 float32x4_t vresidualCDEF = vfmsq_f32(vhalf, vsqrtxCDEF, vhalfrsqrtxCDEF); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20()
[all …]
Dfma3-nr1fma1adj-x64.c29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() local
51 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
53 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
55 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
57 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
59 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
61 __m256 vhalfrsqrtx5 = _mm256_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
63 __m256 vhalfrsqrtx6 = _mm256_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
65 __m256 vhalfrsqrtx7 = _mm256_mul_ps(vrsqrtx7, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
67 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64()
[all …]
Davx512f-nr1fma1adj-x112.c28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() local
48 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
50 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
52 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
54 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
56 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
58 __m512 vhalfrsqrtx5 = _mm512_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
60 __m512 vhalfrsqrtx6 = _mm512_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
62 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
63 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112()
[all …]
Davx512f-nr1fma1adj-x96.c28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() local
46 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
48 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
50 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
52 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
54 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
56 __m512 vhalfrsqrtx5 = _mm512_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
58 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
59 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
60 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96()
[all …]
Dfma3-nr1fma1adj-x56.c29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() local
49 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
51 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
53 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
55 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
57 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
59 __m256 vhalfrsqrtx5 = _mm256_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
61 __m256 vhalfrsqrtx6 = _mm256_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
63 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
64 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56()
[all …]
Dfma3-nr1fma1adj-x48.c29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() local
47 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
49 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
51 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
53 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
55 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
57 __m256 vhalfrsqrtx5 = _mm256_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
59 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
60 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
61 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48()
[all …]
Dneonfma-nr2fma1adj-x16.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() local
41 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
43 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
45 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
47 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
49 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
50 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
51 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
52 float32x4_t vresidualCDEF = vfmsq_f32(vhalf, vsqrtxCDEF, vhalfrsqrtxCDEF); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
63 vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16()
[all …]
Davx512f-nr1fma1adj-x80.c28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() local
44 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
46 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
48 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
50 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
52 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
54 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
55 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
56 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
57 const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80()
[all …]
Davx512f-nr1fma1adj-x64.c28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local
42 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
44 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
46 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
48 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
50 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
51 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
52 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
53 const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
86 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
[all …]
Dneonfma-nr1rsqrts1fma1adj-x40.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() local
86 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
88 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
90 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
92 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
94 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
96 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
98 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
100 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
102 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40()
[all …]
Dfma3-nr1fma1adj-x40.c29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() local
45 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
47 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
49 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
51 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
53 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
55 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
56 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
57 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
58 const __m256 vresidual3 = _mm256_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40()
[all …]
Dneonfma-nr1rsqrts1fma1adj-x36.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() local
81 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
83 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
85 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
87 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
89 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
91 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
93 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
95 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
97 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36()
[all …]
Dfma3-nr1fma1adj-x32.c29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local
43 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
45 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
47 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
49 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
51 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
52 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
53 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
54 const __m256 vresidual3 = _mm256_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
87 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
[all …]
Dneonfma-nr1rsqrts1fma1adj-x32.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() local
76 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
78 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
80 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
82 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
84 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
86 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
88 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
90 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
92 const float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32()
[all …]
Dneonfma-nr2fma1adj-x12.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() local
39 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
41 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
43 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
45 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
46 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
47 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
56 vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
57 vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
58 vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
Dneonfma-nr1rsqrts1fma1adj-x28.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() local
71 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
73 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
75 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
77 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
79 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
81 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
83 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
85 const float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
86 const float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28()
[all …]
Davx512f-nr1fma1adj-x48.c28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local
40 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
42 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
44 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
46 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
47 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
48 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
76 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
77 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
96 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
[all …]
Dfma3-nr1fma1adj-x24.c29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local
41 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
43 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
45 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
47 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
48 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
49 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
77 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
78 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
96 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
[all …]
Dneonfma-nr1rsqrts1fma1adj-x24.c28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() local
66 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
68 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
70 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
72 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
74 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
76 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
78 const float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
79 const float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
80 const float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24()
[all …]

123