/external/XNNPACK/src/f32-vsqrt/gen/
avx512f-nr1fma1adj-x128.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128)
  66  const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf);
  67  const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf);
  68  const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf);
  69  const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf);
  70  const __m512 vresidual4 = _mm512_fnmadd_ps(vsqrtx4, vhalfrsqrtx4, vhalf);
  71  const __m512 vresidual5 = _mm512_fnmadd_ps(vsqrtx5, vhalfrsqrtx5, vhalf);
  72  const __m512 vresidual6 = _mm512_fnmadd_ps(vsqrtx6, vhalfrsqrtx6, vhalf);
  73  const __m512 vresidual7 = _mm512_fnmadd_ps(vsqrtx7, vhalfrsqrtx7, vhalf);
  92  const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0);
  93  const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1);
  [all …]

avx512f-nr1fma1adj-x112.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112)
  62  const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf);
  63  const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf);
  64  const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf);
  65  const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf);
  66  const __m512 vresidual4 = _mm512_fnmadd_ps(vsqrtx4, vhalfrsqrtx4, vhalf);
  67  const __m512 vresidual5 = _mm512_fnmadd_ps(vsqrtx5, vhalfrsqrtx5, vhalf);
  68  const __m512 vresidual6 = _mm512_fnmadd_ps(vsqrtx6, vhalfrsqrtx6, vhalf);
  85  const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0);
  86  const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1);
  87  const __m512 vadjustment2 = _mm512_fnmadd_ps(vsqrtx2, vsqrtx2, vx2);
  [all …]

avx512f-nr1fma1adj-x96.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96)
  58  const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf);
  59  const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf);
  60  const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf);
  61  const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf);
  62  const __m512 vresidual4 = _mm512_fnmadd_ps(vsqrtx4, vhalfrsqrtx4, vhalf);
  63  const __m512 vresidual5 = _mm512_fnmadd_ps(vsqrtx5, vhalfrsqrtx5, vhalf);
  78  const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0);
  79  const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1);
  80  const __m512 vadjustment2 = _mm512_fnmadd_ps(vsqrtx2, vsqrtx2, vx2);
  81  const __m512 vadjustment3 = _mm512_fnmadd_ps(vsqrtx3, vsqrtx3, vx3);
  [all …]

avx512f-nr1fma1adj-x80.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80)
  54  const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf);
  55  const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf);
  56  const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf);
  57  const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf);
  58  const __m512 vresidual4 = _mm512_fnmadd_ps(vsqrtx4, vhalfrsqrtx4, vhalf);
  71  const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0);
  72  const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1);
  73  const __m512 vadjustment2 = _mm512_fnmadd_ps(vsqrtx2, vsqrtx2, vx2);
  74  const __m512 vadjustment3 = _mm512_fnmadd_ps(vsqrtx3, vsqrtx3, vx3);
  75  const __m512 vadjustment4 = _mm512_fnmadd_ps(vsqrtx4, vsqrtx4, vx4);
  [all …]

avx512f-nr1fma1adj-x64.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64)
  50  const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf);
  51  const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf);
  52  const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf);
  53  const __m512 vresidual3 = _mm512_fnmadd_ps(vsqrtx3, vhalfrsqrtx3, vhalf);
  64  const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0);
  65  const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1);
  66  const __m512 vadjustment2 = _mm512_fnmadd_ps(vsqrtx2, vsqrtx2, vx2);
  67  const __m512 vadjustment3 = _mm512_fnmadd_ps(vsqrtx3, vsqrtx3, vx3);
  87  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  90  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
  [all …]

avx512f-nr1fma1adj-x48.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48)
  46  const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf);
  47  const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf);
  48  const __m512 vresidual2 = _mm512_fnmadd_ps(vsqrtx2, vhalfrsqrtx2, vhalf);
  57  const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0);
  58  const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1);
  59  const __m512 vadjustment2 = _mm512_fnmadd_ps(vsqrtx2, vsqrtx2, vx2);
  77  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  80  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
  97  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  100 const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);

avx512f-nr1fma1adj-x32.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32)
  42  const __m512 vresidual0 = _mm512_fnmadd_ps(vsqrtx0, vhalfrsqrtx0, vhalf);
  43  const __m512 vresidual1 = _mm512_fnmadd_ps(vsqrtx1, vhalfrsqrtx1, vhalf);
  50  const __m512 vadjustment0 = _mm512_fnmadd_ps(vsqrtx0, vsqrtx0, vx0);
  51  const __m512 vadjustment1 = _mm512_fnmadd_ps(vsqrtx1, vsqrtx1, vx1);
  67  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  70  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
  87  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  90  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);

avx512f-nr1fma1adj-x16.c  (in xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16)
  36  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  39  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
  56  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  59  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);

/external/XNNPACK/src/f32-vsqrt/
avx512f-nr1fma1adj.c.in  (kernel template)
  44  … const __m512 vresidual${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vhalfrsqrtx${ABC[N]}, vhalf);
  51  …const __m512 vadjustment${ABC[N]} = _mm512_fnmadd_ps(vsqrtx${ABC[N]}, vsqrtx${ABC[N]}, vx${ABC[N]}…
  68  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  71  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
  88  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  91  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
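
The f32-vsqrt matches above all show the same two fnmadd uses: a Newton-Raphson residual (0.5 - sqrtx * halfrsqrtx) and a final adjustment (x - sqrtx * sqrtx). The following is a hypothetical sketch of that nr1fma1adj scheme, not the actual kernel source; the helper name and the use of _mm512_rsqrt14_ps for the initial estimate are assumptions, while the variable names mirror the hits.

#include <immintrin.h>

/* Sketch only: one Newton-Raphson step on the half-reciprocal square root,
 * then a final adjustment of the square-root estimate. */
static inline __m512 f32_sqrt_nr1fma1adj_sketch(__m512 vx) {
  const __m512 vhalf = _mm512_set1_ps(0.5f);

  const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx);        /* ~14-bit estimate of 1/sqrt(x) */
  __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx);          /* estimate of sqrt(x) */
  __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf);  /* 0.5 / sqrt(x) */

  /* Residual 0.5 - sqrtx * halfrsqrtx: the vresidual lines in the listing. */
  const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx);
  vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);

  /* Adjustment x - sqrtx * sqrtx: the vadjustment lines in the listing,
   * folded back into the square-root estimate. */
  const __m512 vadjustment = _mm512_fnmadd_ps(vsqrtx, vsqrtx, vx);
  return _mm512_fmadd_ps(vhalfrsqrtx, vadjustment, vsqrtx);
}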
/external/XNNPACK/src/f32-sigmoid/gen/
avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x128.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_x128)
  159 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  160 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  161 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  162 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  163 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  164 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  165 vr6 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr6, vd6, vone), vr6, vr6);
  166 vr7 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr7, vd7, vone), vr7, vr7);
  218 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  253 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-p5-scalef-nr1fma-x128.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_x128)
  156 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  157 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  158 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  159 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  160 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  161 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  162 vr6 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr6, vd6, vone), vr6, vr6);
  163 vr7 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr7, vd7, vone), vr7, vr7);
  214 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  248 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-p5-scalef-nr1fma-x96.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_x96)
  130 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  131 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  132 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  133 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  134 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  135 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  180 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  214 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x112.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_x112)
  146 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  147 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  148 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  149 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  150 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  151 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  152 vr6 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr6, vd6, vone), vr6, vr6);
  201 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  236 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x112.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_x112)
  152 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  153 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  154 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  155 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  156 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  157 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  158 vr6 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr6, vd6, vone), vr6, vr6);
  207 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  242 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-p5-scalef-nr1fma-x112.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_x112)
  143 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  144 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  145 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  146 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  147 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  148 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  149 vr6 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr6, vd6, vone), vr6, vr6);
  197 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  231 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x128.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_x128)
  165 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  166 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  167 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  168 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  169 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  170 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  171 vr6 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr6, vd6, vone), vr6, vr6);
  172 vr7 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr7, vd7, vone), vr7, vr7);
  224 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  259 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-p5-scalef-nr1fma-x64.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_x64)
  104 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  105 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  106 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  107 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  146 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  180 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-p5-scalef-nr1fma-x80.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_x80)
  117 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  118 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  119 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  120 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  121 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  163 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  197 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x80.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_x80)
  126 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  127 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  128 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  129 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  130 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  173 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  208 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x80.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_x80)
  120 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  121 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  122 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  123 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  124 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  167 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  202 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x96.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_x96)
  133 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  134 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  135 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  136 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  137 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  138 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  184 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  219 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x96.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_x96)
  139 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  140 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  141 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  142 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  143 vr4 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr4, vd4, vone), vr4, vr4);
  144 vr5 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr5, vd5, vone), vr5, vr5);
  190 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  225 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x64.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_x64)
  107 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  108 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  109 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  110 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  150 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  185 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);

avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x64.c  (in xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_x64)
  113 vr0 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr0, vd0, vone), vr0, vr0);
  114 vr1 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr1, vd1, vone), vr1, vr1);
  115 vr2 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr2, vd2, vone), vr2, vr2);
  116 vr3 = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr3, vd3, vone), vr3, vr3);
  156 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  191 vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
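
All of the f32-sigmoid matches above use the same nr1fma pattern: one Newton-Raphson refinement of a reciprocal estimate vr of 1/vd, written as vr = vr + vr * (1 - vr * vd), where vd is presumably the sigmoid denominator. Below is a hypothetical, self-contained sketch of that single step; the function name and the _mm512_rcp14_ps starting estimate are assumptions for illustration, not copied from the files above.

#include <immintrin.h>

/* Sketch only: one Newton-Raphson step refining vr toward 1/vd. */
static inline __m512 f32_recip_nr1fma_sketch(__m512 vd) {
  const __m512 vone = _mm512_set1_ps(1.0f);
  __m512 vr = _mm512_rcp14_ps(vd);  /* ~14-bit estimate of 1/d */
  /* Inner fnmadd computes the residual 1 - vr*vd; the outer fmadd adds
   * vr * residual back onto vr, exactly the expression in the matches above. */
  vr = _mm512_fmadd_ps(_mm512_fnmadd_ps(vr, vd, vone), vr, vr);
  return vr;
}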
/external/XNNPACK/src/math/
sqrt-avx512f-nr2fma.c  (in xnn_math_f32_sqrt__avx512f_nr2fma)
  36  __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  40  vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
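
This reference implementation in src/math/ applies the residual step twice (lines 36 and 40) and, per its name, uses no trailing adjustment. A hedged sketch of that two-iteration variant, again with an assumed helper name and rsqrt14 starting estimate:

#include <immintrin.h>

/* Sketch only: two Newton-Raphson steps on the square-root estimate. */
static inline __m512 f32_sqrt_nr2fma_sketch(__m512 vx) {
  const __m512 vhalf = _mm512_set1_ps(0.5f);

  const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx);        /* ~14-bit estimate of 1/sqrt(x) */
  __m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx);          /* estimate of sqrt(x) */
  __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf);  /* 0.5 / sqrt(x) */

  /* First Newton-Raphson step (matches line 36 above). */
  __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  vhalfrsqrtx = _mm512_fmadd_ps(vhalfrsqrtx, vresidual, vhalfrsqrtx);
  vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);

  /* Second Newton-Raphson step (matches line 40 above). */
  vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
  vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);

  return vsqrtx;
}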