Lines Matching refs:__m512

28   const __m512 vhalf = _mm512_set1_ps(params->fma.half);  in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
30 const __m512 vx0 = _mm512_loadu_ps(x); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
31 const __m512 vx1 = _mm512_loadu_ps(x + 16); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
32 const __m512 vx2 = _mm512_loadu_ps(x + 32); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
33 const __m512 vx3 = _mm512_loadu_ps(x + 48); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
34 const __m512 vx4 = _mm512_loadu_ps(x + 64); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
35 const __m512 vx5 = _mm512_loadu_ps(x + 80); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
36 const __m512 vx6 = _mm512_loadu_ps(x + 96); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
37 const __m512 vx7 = _mm512_loadu_ps(x + 112); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
40 const __m512 vrsqrtx0 = _mm512_rsqrt14_ps(vx0); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
41 const __m512 vrsqrtx1 = _mm512_rsqrt14_ps(vx1); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
42 const __m512 vrsqrtx2 = _mm512_rsqrt14_ps(vx2); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
43 const __m512 vrsqrtx3 = _mm512_rsqrt14_ps(vx3); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
44 const __m512 vrsqrtx4 = _mm512_rsqrt14_ps(vx4); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
45 const __m512 vrsqrtx5 = _mm512_rsqrt14_ps(vx5); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
46 const __m512 vrsqrtx6 = _mm512_rsqrt14_ps(vx6); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
47 const __m512 vrsqrtx7 = _mm512_rsqrt14_ps(vx7); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
49 __m512 vsqrtx0 = _mm512_mul_ps(vrsqrtx0, vx0); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
50 __m512 vhalfrsqrtx0 = _mm512_mul_ps(vrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
51 __m512 vsqrtx1 = _mm512_mul_ps(vrsqrtx1, vx1); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
52 __m512 vhalfrsqrtx1 = _mm512_mul_ps(vrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
53 __m512 vsqrtx2 = _mm512_mul_ps(vrsqrtx2, vx2); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
54 __m512 vhalfrsqrtx2 = _mm512_mul_ps(vrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
55 __m512 vsqrtx3 = _mm512_mul_ps(vrsqrtx3, vx3); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
56 __m512 vhalfrsqrtx3 = _mm512_mul_ps(vrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
57 __m512 vsqrtx4 = _mm512_mul_ps(vrsqrtx4, vx4); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
58 __m512 vhalfrsqrtx4 = _mm512_mul_ps(vrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
59 __m512 vsqrtx5 = _mm512_mul_ps(vrsqrtx5, vx5); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
60 __m512 vhalfrsqrtx5 = _mm512_mul_ps(vrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
61 __m512 vsqrtx6 = _mm512_mul_ps(vrsqrtx6, vx6); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
62 __m512 vhalfrsqrtx6 = _mm512_mul_ps(vrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
63 __m512 vsqrtx7 = _mm512_mul_ps(vrsqrtx7, vx7); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
64 __m512 vhalfrsqrtx7 = _mm512_mul_ps(vrsqrtx7, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
66 const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
67 const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
68 const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
69 const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
70 const __m512 vresidual4 = _mm512_fnmadd_ps(vsqrtx4, vhalfrsqrtx4, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
71 const __m512 vresidual5 = _mm512_fnmadd_ps(vsqrtx5, vhalfrsqrtx5, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
72 const __m512 vresidual6 = _mm512_fnmadd_ps(vsqrtx6, vhalfrsqrtx6, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
73 const __m512 vresidual7 = _mm512_fnmadd_ps(vsqrtx7, vhalfrsqrtx7, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
92 const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
93 const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
94 const __m512 vadjustment2 = _mm512_fnmadd_ps(vsqrtx2, vsqrtx2, vx2); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
95 const __m512 vadjustment3 = _mm512_fnmadd_ps(vsqrtx3, vsqrtx3, vx3); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
96 const __m512 vadjustment4 = _mm512_fnmadd_ps(vsqrtx4, vsqrtx4, vx4); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
97 const __m512 vadjustment5 = _mm512_fnmadd_ps(vsqrtx5, vsqrtx5, vx5); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
98 const __m512 vadjustment6 = _mm512_fnmadd_ps(vsqrtx6, vsqrtx6, vx6); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
99 const __m512 vadjustment7 = _mm512_fnmadd_ps(vsqrtx7, vsqrtx7, vx7); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
101 const __m512 vy0 = _mm512_fmadd_ps(vhalfrsqrtx0, vadjustment0, vsqrtx0); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
102 const __m512 vy1 = _mm512_fmadd_ps(vhalfrsqrtx1, vadjustment1, vsqrtx1); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
103 const __m512 vy2 = _mm512_fmadd_ps(vhalfrsqrtx2, vadjustment2, vsqrtx2); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
104 const __m512 vy3 = _mm512_fmadd_ps(vhalfrsqrtx3, vadjustment3, vsqrtx3); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
105 const __m512 vy4 = _mm512_fmadd_ps(vhalfrsqrtx4, vadjustment4, vsqrtx4); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
106 const __m512 vy5 = _mm512_fmadd_ps(vhalfrsqrtx5, vadjustment5, vsqrtx5); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
107 const __m512 vy6 = _mm512_fmadd_ps(vhalfrsqrtx6, vadjustment6, vsqrtx6); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
108 const __m512 vy7 = _mm512_fmadd_ps(vhalfrsqrtx7, vadjustment7, vsqrtx7); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
121 const __m512 vx = _mm512_loadu_ps(x); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
124 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
125 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
126 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
127 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
130 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
131 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
143 const __m512 vx = _mm512_maskz_loadu_ps(vmask, x); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
144 const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
145 __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
146 __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
147 const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
150 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()
151 const __m512 vy = _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx); in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128()