/external/XNNPACK/src/f32-vsqrt/gen/ |
D | neonfma-nr2fma1adj-x40.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() local 53 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 55 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 57 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 59 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 61 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 63 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 65 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 67 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() 69 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40() [all …]
|
D | neonfma-nr2fma1adj-x36.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() local 51 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 53 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 55 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 57 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 59 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 61 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 63 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 65 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() 67 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36() [all …]
|
D | neonfma-nr2fma1adj-x32.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() local 49 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 51 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 53 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 55 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 57 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 59 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 61 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 63 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() 65 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32() [all …]
|
D | neonfma-nr2fma1adj-x28.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() local 47 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 49 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 51 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 53 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 55 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 57 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 59 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 61 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() 62 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28() [all …]
|
D | neonfma-nr2fma1adj-x24.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() local 45 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 47 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 49 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 51 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 53 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 55 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 57 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 58 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() 59 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24() [all …]
|
D | avx512f-nr1fma1adj-x128.c | 28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() local 50 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 52 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 54 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 56 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 58 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 60 __m512 vhalfrsqrtx5 = _mm512_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 62 __m512 vhalfrsqrtx6 = _mm512_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 64 __m512 vhalfrsqrtx7 = _mm512_mul_ps(vrsqrtx7, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() 66 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128() [all …]
|
D | neonfma-nr2fma1adj-x20.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() local 43 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 45 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 47 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 49 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 51 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 53 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 54 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 55 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() 56 float32x4_t vresidualCDEF = vfmsq_f32(vhalf, vsqrtxCDEF, vhalfrsqrtxCDEF); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20() [all …]
|
D | fma3-nr1fma1adj-x64.c | 29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() local 51 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 53 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 55 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 57 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 59 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 61 __m256 vhalfrsqrtx5 = _mm256_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 63 __m256 vhalfrsqrtx6 = _mm256_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 65 __m256 vhalfrsqrtx7 = _mm256_mul_ps(vrsqrtx7, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() 67 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64() [all …]
|
D | avx512f-nr1fma1adj-x112.c | 28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() local 48 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 50 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 52 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 54 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 56 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 58 __m512 vhalfrsqrtx5 = _mm512_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 60 __m512 vhalfrsqrtx6 = _mm512_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 62 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() 63 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112() [all …]
|
D | avx512f-nr1fma1adj-x96.c | 28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() local 46 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 48 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 50 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 52 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 54 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 56 __m512 vhalfrsqrtx5 = _mm512_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 58 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 59 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() 60 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96() [all …]
|
D | fma3-nr1fma1adj-x56.c | 29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() local 49 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 51 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 53 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 55 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 57 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 59 __m256 vhalfrsqrtx5 = _mm256_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 61 __m256 vhalfrsqrtx6 = _mm256_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 63 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() 64 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56() [all …]
|
D | fma3-nr1fma1adj-x48.c | 29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() local 47 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 49 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 51 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 53 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 55 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 57 __m256 vhalfrsqrtx5 = _mm256_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 59 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 60 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() 61 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48() [all …]
|
D | neonfma-nr2fma1adj-x16.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() local 41 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 43 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 45 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 47 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 49 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 50 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 51 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 52 float32x4_t vresidualCDEF = vfmsq_f32(vhalf, vsqrtxCDEF, vhalfrsqrtxCDEF); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() 63 vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16() [all …]
|
D | avx512f-nr1fma1adj-x80.c | 28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() local 44 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 46 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 48 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 50 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 52 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 54 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 55 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 56 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() 57 const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80() [all …]
|
D | avx512f-nr1fma1adj-x64.c | 28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local 42 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 44 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 46 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 48 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 50 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 51 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 52 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 53 const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 86 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() [all …]
|
D | neonfma-nr1rsqrts1fma1adj-x40.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() local 86 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 88 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 90 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 92 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 94 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 96 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 98 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 100 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() 102 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40() [all …]
|
D | fma3-nr1fma1adj-x40.c | 29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() local 45 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 47 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 49 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 51 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 53 __m256 vhalfrsqrtx4 = _mm256_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 55 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 56 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 57 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() 58 const __m256 vresidual3 = _mm256_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40() [all …]
|
D | neonfma-nr1rsqrts1fma1adj-x36.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() local 81 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 83 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 85 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 87 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 89 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 91 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 93 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 95 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() 97 float32x4_t vhalfrsqrtxWXYZ = vmulq_f32(vrsqrtxWXYZ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36() [all …]
|
D | fma3-nr1fma1adj-x32.c | 29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local 43 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 45 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 47 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 49 __m256 vhalfrsqrtx3 = _mm256_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 51 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 52 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 53 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 54 const __m256 vresidual3 = _mm256_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 87 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() [all …]
|
D | neonfma-nr1rsqrts1fma1adj-x32.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() local 76 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 78 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 80 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 82 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 84 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 86 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 88 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 90 float32x4_t vhalfrsqrtxSTUV = vmulq_f32(vrsqrtxSTUV, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() 92 const float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32() [all …]
|
D | neonfma-nr2fma1adj-x12.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() local 39 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 41 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 43 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 45 float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 46 float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 47 float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 56 vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 57 vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12() 58 vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12()
|
D | neonfma-nr1rsqrts1fma1adj-x28.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() local 71 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 73 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 75 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 77 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 79 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 81 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 83 float32x4_t vhalfrsqrtxOPQR = vmulq_f32(vrsqrtxOPQR, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 85 const float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() 86 const float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28() [all …]
|
D | avx512f-nr1fma1adj-x48.c | 28 const __m512 vhalf = _mm512_set1_ps(params->fma.half); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local 40 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 42 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 44 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 46 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 47 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 48 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 76 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 77 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 96 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() [all …]
|
D | fma3-nr1fma1adj-x24.c | 29 const __m256 vhalf = _mm256_broadcast_ss(&params->fma.half); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local 41 __m256 vhalfrsqrtx0 = _mm256_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 43 __m256 vhalfrsqrtx1 = _mm256_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 45 __m256 vhalfrsqrtx2 = _mm256_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 47 const __m256 vresidual0 = _mm256_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 48 const __m256 vresidual1 = _mm256_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 49 const __m256 vresidual2 = _mm256_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 77 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 78 const __m256 vresidual = _mm256_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 96 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() [all …]
|
D | neonfma-nr1rsqrts1fma1adj-x24.c | 28 const float32x4_t vhalf = vmovq_n_f32(0.5f); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() local 66 float32x4_t vhalfrsqrtx0123 = vmulq_f32(vrsqrtx0123, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 68 float32x4_t vhalfrsqrtx4567 = vmulq_f32(vrsqrtx4567, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 70 float32x4_t vhalfrsqrtx89AB = vmulq_f32(vrsqrtx89AB, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 72 float32x4_t vhalfrsqrtxCDEF = vmulq_f32(vrsqrtxCDEF, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 74 float32x4_t vhalfrsqrtxGHIJ = vmulq_f32(vrsqrtxGHIJ, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 76 float32x4_t vhalfrsqrtxKLMN = vmulq_f32(vrsqrtxKLMN, vhalf); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 78 const float32x4_t vresidual0123 = vfmsq_f32(vhalf, vsqrtx0123, vhalfrsqrtx0123); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 79 const float32x4_t vresidual4567 = vfmsq_f32(vhalf, vsqrtx4567, vhalfrsqrtx4567); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() 80 const float32x4_t vresidual89AB = vfmsq_f32(vhalf, vsqrtx89AB, vhalfrsqrtx89AB); in xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24() [all …]
|