/external/XNNPACK/src/math/ |
D | sqrt-neon-nr3rsqrts.c | 26 float32x4_t vrsqrtx = vrsqrteq_f32(vx); in xnn_math_f32_sqrt__neon_nr3rsqrts() local 31 vrsqrtx = vmulq_f32(vrsqrtx, vrsqrtsq_f32(vx, vmulq_f32(vrsqrtx, vrsqrtx))); in xnn_math_f32_sqrt__neon_nr3rsqrts() 32 vrsqrtx = vmulq_f32(vrsqrtx, vrsqrtsq_f32(vmulq_f32(vrsqrtx, vx), vrsqrtx)); in xnn_math_f32_sqrt__neon_nr3rsqrts() 33 vrsqrtx = vmulq_f32(vrsqrtx, vrsqrtsq_f32(vmulq_f32(vrsqrtx, vx), vrsqrtx)); in xnn_math_f32_sqrt__neon_nr3rsqrts() 36 const float32x4_t vy = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neon_nr3rsqrts()
|
D | sqrt-neon-nr2rsqrts.c | 26 float32x4_t vrsqrtx = vrsqrteq_f32(vx); in xnn_math_f32_sqrt__neon_nr2rsqrts() local 31 vrsqrtx = vmulq_f32(vrsqrtx, vrsqrtsq_f32(vx, vmulq_f32(vrsqrtx, vrsqrtx))); in xnn_math_f32_sqrt__neon_nr2rsqrts() 32 vrsqrtx = vmulq_f32(vrsqrtx, vrsqrtsq_f32(vmulq_f32(vrsqrtx, vx), vrsqrtx)); in xnn_math_f32_sqrt__neon_nr2rsqrts() 35 const float32x4_t vy = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neon_nr2rsqrts()
|
D | sqrt-sse-nr2mac.c | 29 __m128 vrsqrtx = _mm_rsqrt_ps(vx); in xnn_math_f32_sqrt__sse_nr2mac() local 34 …vrsqrtx = _mm_mul_ps(vrsqrtx, _mm_sub_ps(_mm_mul_ps(_mm_mul_ps(vhalfx, vrsqrtx), vrsqrtx), vthree_… in xnn_math_f32_sqrt__sse_nr2mac() 35 …vrsqrtx = _mm_mul_ps(vrsqrtx, _mm_sub_ps(_mm_mul_ps(_mm_mul_ps(vhalfx, vrsqrtx), vrsqrtx), vthree_… in xnn_math_f32_sqrt__sse_nr2mac() 38 const __m128 vy = _mm_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__sse_nr2mac()
|
D | sqrt-neonfma-nr1rsqrts1fma1adj.c | 27 float32x4_t vrsqrtx = vrsqrteq_f32(vx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() local 31 vrsqrtx = vmulq_f32(vrsqrtx, vrsqrtsq_f32(vx, vmulq_f32(vrsqrtx, vrsqrtx))); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 33 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj() 34 float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf); in xnn_math_f32_sqrt__neonfma_nr1rsqrts1fma1adj()
|
D | sqrt-neon-nr1rsqrts.c | 26 float32x4_t vrsqrtx = vrsqrteq_f32(vx); in xnn_math_f32_sqrt__neon_nr1rsqrts() local 30 vrsqrtx = vmulq_f32(vrsqrtx, vrsqrtsq_f32(vx, vmulq_f32(vrsqrtx, vrsqrtx))); in xnn_math_f32_sqrt__neon_nr1rsqrts() 33 const float32x4_t vy = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neon_nr1rsqrts()
|
D | sqrt-sse-nr1mac.c | 29 __m128 vrsqrtx = _mm_rsqrt_ps(vx); in xnn_math_f32_sqrt__sse_nr1mac() local 34 …vrsqrtx = _mm_mul_ps(vrsqrtx, _mm_sub_ps(vthree_halfs, _mm_mul_ps(vhalfx, _mm_mul_ps(vrsqrtx, vrsq… in xnn_math_f32_sqrt__sse_nr1mac() 37 const __m128 vy = _mm_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__sse_nr1mac()
|
D | sqrt-sse-hh1mac.c | 30 __m128 vrsqrtx = _mm_rsqrt_ps(vx); in xnn_math_f32_sqrt__sse_hh1mac() local 35 const __m128 vt = _mm_mul_ps(_mm_mul_ps(vx, vrsqrtx), vrsqrtx); in xnn_math_f32_sqrt__sse_hh1mac() 36 …vrsqrtx = _mm_mul_ps(vrsqrtx, _mm_add_ps(_mm_mul_ps(vt, _mm_sub_ps(_mm_mul_ps(vt, vc0375), vc1250)… in xnn_math_f32_sqrt__sse_hh1mac() 39 const __m128 vy = _mm_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__sse_hh1mac()
|
D | sqrt-fma3-nr1fma.c | 28 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_math_f32_sqrt__fma3_nr1fma() local 29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr1fma() 30 const __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr1fma()
|
D | sqrt-neonfma-nr1fma.c | 27 const float32x4_t vrsqrtx = vrsqrteq_f32(vx); in xnn_math_f32_sqrt__neonfma_nr1fma() local 28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr1fma() 29 const float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf); in xnn_math_f32_sqrt__neonfma_nr1fma()
|
D | sqrt-avx512f-nr1fma.c | 28 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_math_f32_sqrt__avx512f_nr1fma() local 29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma() 30 const __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr1fma()
|
D | sqrt-neonfma-nr2fma.c | 27 const float32x4_t vrsqrtx = vrsqrteq_f32(vx); in xnn_math_f32_sqrt__neonfma_nr2fma() local 28 float32x4_t vsqrtx = vmulq_f32(vrsqrtx, vx); in xnn_math_f32_sqrt__neonfma_nr2fma() 29 float32x4_t vhalfrsqrtx = vmulq_f32(vrsqrtx, vhalf); in xnn_math_f32_sqrt__neonfma_nr2fma()
|
D | sqrt-avx512f-nr2fma.c | 28 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_math_f32_sqrt__avx512f_nr2fma() local 29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr2fma() 30 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr2fma()
|
D | sqrt-fma3-nr2fma.c | 28 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_math_f32_sqrt__fma3_nr2fma() local 29 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__fma3_nr2fma() 30 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__fma3_nr2fma()
|
D | sqrt-avx512f-nr1fma1adj.c | 28 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() local 29 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_math_f32_sqrt__avx512f_nr1fma1adj() 30 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_math_f32_sqrt__avx512f_nr1fma1adj()
|
/external/XNNPACK/src/f32-vsqrt/gen/ |
D | avx512f-nr1fma1adj-x16.c | 33 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local 34 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 35 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 53 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() local 54 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16() 55 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16()
|
D | fma3-nr1fma1adj-x8.c | 34 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local 35 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 36 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 53 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() local 54 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8() 55 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8()
|
D | avx512f-nr1fma1adj-x32.c | 64 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local 65 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 66 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 84 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() local 85 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32() 86 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32()
|
D | fma3-nr1fma1adj-x16.c | 65 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local 66 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 67 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 84 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() local 85 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16() 86 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16()
|
D | fma3-nr1fma1adj-x24.c | 75 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local 76 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 77 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 94 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() local 95 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24() 96 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24()
|
D | avx512f-nr1fma1adj-x48.c | 74 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local 75 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 76 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 94 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() local 95 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48() 96 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48()
|
D | avx512f-nr1fma1adj-x64.c | 84 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local 85 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 86 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 104 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() local 105 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64() 106 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64()
|
D | fma3-nr1fma1adj-x32.c | 85 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local 86 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 87 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 104 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() local 105 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32() 106 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32()
|
/external/XNNPACK/src/f32-vsqrt/ |
D | avx512f-nr1fma1adj.c.in | 37 const __m512 vrsqrtx${ABC[N]} = _mm512_rsqrt14_ps(vx${ABC[N]}); 40 __m512 vsqrtx${ABC[N]} = _mm512_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]}); 41 __m512 vhalfrsqrtx${ABC[N]} = _mm512_mul_ps(vrsqrtx${ABC[N]}, vhalf); 65 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); variable 66 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); 67 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); 85 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); variable 86 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); 87 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf);
|
D | fma3-nr1fma1adj.c.in | 38 const __m256 vrsqrtx${ABC[N]} = _mm256_rsqrt_ps(vx${ABC[N]}); 41 __m256 vsqrtx${ABC[N]} = _mm256_mul_ps(vrsqrtx${ABC[N]}, vx${ABC[N]}); 42 __m256 vhalfrsqrtx${ABC[N]} = _mm256_mul_ps(vrsqrtx${ABC[N]}, vhalf); 66 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); variable 67 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); 68 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf); 85 const __m256 vrsqrtx = _mm256_rsqrt_ps(vx); variable 86 __m256 vsqrtx = _mm256_mul_ps(vrsqrtx, vx); 87 __m256 vhalfrsqrtx = _mm256_mul_ps(vrsqrtx, vhalf);
|
D | neonfma-nr1rsqrts1fma1adj.c.in | 34 float32x4_t vrsqrtx${ABC[N:N+4]} = vrsqrteq_f32(vx${ABC[N:N+4]}); 37 const float32x4_t vrx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vrsqrtx${ABC[N:N+4]}); 43 vrsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vcorrection${ABC[N:N+4]}); 46 float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]}); 47 float32x4_t vhalfrsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vhalf);
|