/external/XNNPACK/src/f32-velu/gen/ |
D | velu-scalar-rr2-p6-x5.c | in xnn_f32_velu_ukernel__scalar_rr2_p6_x5():
     78  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
     84  vt4 = vn4 * vminus_ln2_lo + vt4;
    104  vt4 = 0.0f;
    111  float vp4 = vc6 * vt4 + vc5;
    117  vp4 = vp4 * vt4 + vc4;
    123  vp4 = vp4 * vt4 + vc3;
    129  vp4 = vp4 * vt4 + vc2;
    135  vp4 *= vt4;
    145  vt4 *= vs4;
    152  vp4 = vp4 * vt4 + vt4;
|
D | velu-wasm-rr2-p6-x5.c | in xnn_f32_velu_ukernel__wasm_rr2_p6_x5():
     78  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
     84  vt4 = vn4 * vminus_ln2_lo + vt4;
     91  float vp4 = vc6 * vt4 + vc5;
     97  vp4 = vp4 * vt4 + vc4;
    103  vp4 = vp4 * vt4 + vc3;
    109  vp4 = vp4 * vt4 + vc2;
    115  vp4 *= vt4;
    125  vt4 *= vs4;
    132  vp4 = vp4 * vt4 + vt4;
|
D | velu-scalar-rr2-p6-x6.c | in xnn_f32_velu_ukernel__scalar_rr2_p6_x6():
     83  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
     90  vt4 = vn4 * vminus_ln2_lo + vt4;
    111  vt4 = 0.0f;
    122  float vp4 = vc6 * vt4 + vc5;
    129  vp4 = vp4 * vt4 + vc4;
    136  vp4 = vp4 * vt4 + vc3;
    143  vp4 = vp4 * vt4 + vc2;
    150  vp4 *= vt4;
    161  vt4 *= vs4;
    170  vp4 = vp4 * vt4 + vt4;
|
D | velu-wasm-rr2-p6-x6.c | in xnn_f32_velu_ukernel__wasm_rr2_p6_x6():
     83  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
     90  vt4 = vn4 * vminus_ln2_lo + vt4;
     98  float vp4 = vc6 * vt4 + vc5;
    105  vp4 = vp4 * vt4 + vc4;
    112  vp4 = vp4 * vt4 + vc3;
    119  vp4 = vp4 * vt4 + vc2;
    126  vp4 *= vt4;
    137  vt4 *= vs4;
    146  vp4 = vp4 * vt4 + vt4;
|
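All four rr2-p6 entries above index the same per-element computation: the reduced argument vt4 is built from a two-constant (hi/lo) multiple of -ln2, a degree-6 polynomial is evaluated in Horner form, and vp4 = vp4 * vt4 + vt4 reconstructs 2**n * (e**t - 1); the vt4 = 0.0f lines in the scalar variants flush the reduced argument for saturated inputs. A minimal single-element C sketch of that structure follows. It is not XNNPACK code: elu_rr2_p6_sketch is a hypothetical name, roundf()/ldexpf() stand in for the kernels' magic-bias and exponent-bit tricks, Taylor coefficients stand in for the tuned minimax coefficients, and the prescale/beta parameters of the real kernels are omitted.

#include <math.h>

/* Illustrative only: same structure as the rr2-p6 kernels, simplified. */
static float elu_rr2_p6_sketch(float x, float alpha) {
  const float log2e        =  0x1.715476p+0f;
  const float minus_ln2_hi = -0x1.62E400p-1f;   /* high part of -ln(2) */
  const float minus_ln2_lo = -0x1.7F7D1Cp-20f;  /* low-order correction */
  const float c6 = 1.0f / 720.0f, c5 = 1.0f / 120.0f, c4 = 1.0f / 24.0f;
  const float c3 = 1.0f / 6.0f,   c2 = 0.5f;    /* Taylor stand-ins */

  if (x >= 0.0f) return x;                /* identity branch of ELU */

  const float z = x;                      /* z < 0: need alpha * (e**z - 1) */
  const float n = roundf(z * log2e);      /* z = n*ln2 + t, |t| <= ln2/2 */
  const float s = ldexpf(1.0f, (int) n);  /* s = 2**n, underflows to 0 for very negative z */
  float t = n * minus_ln2_hi + z;         /* "vt4 = vn4 * vminus_ln2_hi + vz4" */
  t = n * minus_ln2_lo + t;               /* "vt4 = vn4 * vminus_ln2_lo + vt4" */

  float p = c6 * t + c5;                  /* Horner: p(t) ~= (e**t - 1 - t) / t**2 */
  p = p * t + c4;
  p = p * t + c3;
  p = p * t + c2;
  p *= t;                                 /* (e**t - 1 - t) / t */
  t *= s;                                 /* "vt4 *= vs4" */
  p = p * t + t;                          /* s * (e**t - 1), "vp4 = vp4 * vt4 + vt4" */
  return alpha * (p + (s - 1.0f));        /* alpha * (e**z - 1) */
}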
D | velu-scalar-rr2-lut16-p3-x5.c | in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x5():
     87  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
    110  vt4 = vn4 * vminus_ln2_lo + vt4;
    113  vt4 = 0.0f;
    120  float vp4 = vc3 * vt4 + vc2;
    126  vp4 *= vt4;
    136  vt4 *= vs4;
    143  vp4 = vp4 * vt4 + vt4;
|
D | velu-scalar-rr2-lut16-p3-x6.c | in xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6():
     93  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
    118  vt4 = vn4 * vminus_ln2_lo + vt4;
    121  vt4 = 0.0f;
    133  float vp4 = vc3 * vt4 + vc2;
    140  vp4 *= vt4;
    151  vt4 *= vs4;
    160  vp4 = vp4 * vt4 + vt4;
|
D | velu-wasm-rr2-lut16-p3-x5.c | in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x5():
     87  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
     94  vt4 = vn4 * vminus_ln2_lo + vt4;
    100  float vp4 = vc3 * vt4 + vc2;
    106  vp4 *= vt4;
    116  vt4 *= vs4;
    123  vp4 = vp4 * vt4 + vt4;
|
D | velu-wasm-rr2-lut16-p3-x6.c | in xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x6():
     93  float vt4 = vn4 * vminus_ln2_hi + vz4;  (local)
    102  vt4 = vn4 * vminus_ln2_lo + vt4;
    109  float vp4 = vc3 * vt4 + vc2;
    116  vp4 *= vt4;
    127  vt4 *= vs4;
    136  vp4 = vp4 * vt4 + vt4;
|
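The lut16-p3 entries trade four of those polynomial steps for a 16-entry table of 2**(i/16): the low four bits of n select a table entry and only a degree-3 polynomial remains, which is why the index shows just vp4 = vc3 * vt4 + vc2 and vp4 *= vt4. A hedged single-element sketch of the idea (hypothetical name, illustrative table values and coefficients, library rounding in place of the kernels' bit manipulation):

#include <math.h>

/* Illustrative only: rr2-lut16-p3 scheme, z = n*(ln2/16) + t with n = 16*k + i. */
static float expm1_lut16_p3_sketch(float z) {   /* assumes z <= 0 */
  static const float exp2_o16[16] = {           /* 2**(i/16), i = 0..15 */
    1.00000000f, 1.04427378f, 1.09050773f, 1.13878863f,
    1.18920712f, 1.24185781f, 1.29683955f, 1.35425555f,
    1.41421356f, 1.47682615f, 1.54221083f, 1.61049033f,
    1.68179283f, 1.75625216f, 1.83400809f, 1.91520656f,
  };
  const float log2e_x16        =  16.0f * 0x1.715476p+0f;
  const float minus_ln2_o16_hi = -0x1.62E400p-5f;   /* -(ln2/16), high part */
  const float minus_ln2_o16_lo = -0x1.7F7D1Cp-24f;  /* -(ln2/16), low part  */
  const float c3 = 1.0f / 6.0f, c2 = 0.5f;          /* Taylor stand-ins     */

  const float n   = roundf(z * log2e_x16);
  const int   in  = (int) n;
  const int   idx = in & 15;                        /* table index i        */
  const float s   = ldexpf(exp2_o16[idx], (in - idx) / 16);  /* s = 2**(n/16) */
  float t = n * minus_ln2_o16_hi + z;
  t = n * minus_ln2_o16_lo + t;

  float p = c3 * t + c2;                  /* "vp4 = vc3 * vt4 + vc2" */
  p *= t;                                 /* "vp4 *= vt4"            */
  t *= s;                                 /* "vt4 *= vs4"            */
  p = p * t + t;                          /* s * (e**t - 1)          */
  return p + (s - 1.0f);                  /* e**z - 1                */
}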
D | velu-avx-rr2-p6-x40.c | in xnn_f32_velu_ukernel__avx_rr2_p6_x40():
     88  __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vz4);  (local)
     95  vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4);
    101  __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc6, vt4), vc5);
    107  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc4);
    113  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3);
    119  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2);
    125  vp4 = _mm256_mul_ps(vp4, vt4);
    135  vt4 = _mm256_mul_ps(vt4, vs4);
    142  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vt4);
|
D | velu-avx512f-rr1-p6-x80.c | in xnn_f32_velu_ukernel__avx512f_rr1_p6_x80():
     77  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vz4);  (local)
     83  __m512 vp4 = _mm512_fmadd_ps(vc6, vt4, vc5);
     89  vp4 = _mm512_fmadd_ps(vp4, vt4, vc4);
     95  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
    101  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
    111  vp4 = _mm512_mul_ps(vp4, vt4);
    112  vt4 = _mm512_mul_ps(vt4, vs4);
    124  vp4 = _mm512_fmadd_ps(vp4, vt4, vt4);
|
D | velu-avx2-rr1-p6-x40.c | in xnn_f32_velu_ukernel__avx2_rr1_p6_x40():
     77  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);  (local)
     83  __m256 vp4 = _mm256_fmadd_ps(vc6, vt4, vc5);
     89  vp4 = _mm256_fmadd_ps(vp4, vt4, vc4);
     95  vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
    101  vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
    111  vp4 = _mm256_mul_ps(vp4, vt4);
    112  vt4 = _mm256_mul_ps(vt4, vs4);
    123  vp4 = _mm256_fmadd_ps(vp4, vt4, vt4);
|
D | velu-avx512f-rr1-p6-x96.c | in xnn_f32_velu_ukernel__avx512f_rr1_p6_x96():
     82  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vz4);  (local)
     89  __m512 vp4 = _mm512_fmadd_ps(vc6, vt4, vc5);
     96  vp4 = _mm512_fmadd_ps(vp4, vt4, vc4);
    103  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
    110  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
    121  vp4 = _mm512_mul_ps(vp4, vt4);
    122  vt4 = _mm512_mul_ps(vt4, vs4);
    137  vp4 = _mm512_fmadd_ps(vp4, vt4, vt4);
|
D | velu-avx2-rr1-p6-x48.c | in xnn_f32_velu_ukernel__avx2_rr1_p6_x48():
     82  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vz4);  (local)
     89  __m256 vp4 = _mm256_fmadd_ps(vc6, vt4, vc5);
     96  vp4 = _mm256_fmadd_ps(vp4, vt4, vc4);
    103  vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
    110  vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
    121  vp4 = _mm256_mul_ps(vp4, vt4);
    122  vt4 = _mm256_mul_ps(vt4, vs4);
    135  vp4 = _mm256_fmadd_ps(vp4, vt4, vt4);
|
D | velu-avx-rr2-p6-x48.c | in xnn_f32_velu_ukernel__avx_rr2_p6_x48():
     94  __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vz4);  (local)
    103  vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4);
    110  __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc6, vt4), vc5);
    117  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc4);
    124  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3);
    131  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2);
    138  vp4 = _mm256_mul_ps(vp4, vt4);
    149  vt4 = _mm256_mul_ps(vt4, vs4);
    158  vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vt4);
|
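The AVX and AVX2/AVX-512 files above vectorize the same p6 scheme; the rr1 variants additionally drop the low-order -ln2 term because fused multiply-add keeps the single-constant reduction (vt4 = _mm*_fmadd_ps(vn4, vminus_ln2, vz4)) accurate enough. Below is a hedged 8-lane sketch of that rr1-p6 core: the name is hypothetical, AVX2+FMA is assumed, a cvt/shift exponent construction and Taylor coefficients stand in for the magic-bias trick and tuned coefficients of the generated files, and the input is assumed to be clamped so 2**n stays in normal range (the real kernels saturate very negative inputs first).

#include <immintrin.h>

/* Illustrative only: computes e**z - 1 for 8 lanes, z <= 0, rr1-p6 style. */
static __m256 expm1_rr1_p6_sketch(__m256 vz) {
  const __m256 vlog2e     = _mm256_set1_ps(0x1.715476p+0f);
  const __m256 vminus_ln2 = _mm256_set1_ps(-0x1.62E430p-1f);
  const __m256 vc6 = _mm256_set1_ps(1.0f / 720.0f), vc5 = _mm256_set1_ps(1.0f / 120.0f);
  const __m256 vc4 = _mm256_set1_ps(1.0f / 24.0f),  vc3 = _mm256_set1_ps(1.0f / 6.0f);
  const __m256 vc2 = _mm256_set1_ps(0.5f),          vone = _mm256_set1_ps(1.0f);

  const __m256 vn = _mm256_round_ps(_mm256_mul_ps(vz, vlog2e),
                                    _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  /* vs = 2**n, built in the exponent field (valid while n > -127) */
  __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(
      _mm256_add_epi32(_mm256_cvtps_epi32(vn), _mm256_set1_epi32(127)), 23));
  __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vz);  /* single-step reduction */

  __m256 vp = _mm256_fmadd_ps(vc6, vt, vc5);        /* Horner, as indexed above */
  vp = _mm256_fmadd_ps(vp, vt, vc4);
  vp = _mm256_fmadd_ps(vp, vt, vc3);
  vp = _mm256_fmadd_ps(vp, vt, vc2);
  vp = _mm256_mul_ps(vp, vt);
  vt = _mm256_mul_ps(vt, vs);
  vs = _mm256_sub_ps(vs, vone);
  vp = _mm256_fmadd_ps(vp, vt, vt);                 /* 2**n * (e**t - 1) */
  return _mm256_add_ps(vp, vs);                     /* e**z - 1 */
}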
/external/XNNPACK/src/f32-vscaleexpminusmax/gen/ |
D | avx2-p5-x40.c | in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x40():
     89  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     95  vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_lo, vt4);
    102  __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
    108  vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
    114  vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
    120  vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
    130  vt4 = _mm256_mul_ps(vt4, vs4);
    136  __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4);
|
D | avx2-p5-x48.c | in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x48():
     94  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
    101  vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_lo, vt4);
    109  __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
    116  vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
    123  vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
    130  vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
    141  vt4 = _mm256_mul_ps(vt4, vs4);
    148  __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4);
|
D | avx512f-p5-scalef-x80.c | in xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x80():
     70  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     76  vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_lo, vt4);
     83  __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4);
     89  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
     95  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
    101  vp4 = _mm512_fmadd_ps(vp4, vt4, vc1);
    107  vp4 = _mm512_fmadd_ps(vp4, vt4, vc0);
|
D | avx512f-p5-scalef-x96.c | in xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x96():
     73  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     80  vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_lo, vt4);
     88  __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4);
     95  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
    102  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
    109  vp4 = _mm512_fmadd_ps(vp4, vt4, vc1);
    116  vp4 = _mm512_fmadd_ps(vp4, vt4, vc0);
|
D | avx2-p5-x56.c | in xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x56():
     99  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
    107  vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_lo, vt4);
    116  __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
    124  vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
    132  vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
    140  vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
    152  vt4 = _mm256_mul_ps(vt4, vs4);
    160  __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4);
|
D | avx512f-p5-scalef-x112.c | in xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x112():
     76  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     84  vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_lo, vt4);
     93  __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4);
    101  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
    109  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
    117  vp4 = _mm512_fmadd_ps(vp4, vt4, vc1);
    125  vp4 = _mm512_fmadd_ps(vp4, vt4, vc0);
|
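The f32-vscaleexpminusmax entries use the same rr2 reduction with a degree-5 polynomial that stops at vc1, then assemble the result as vf4 = fmadd(vt4, vp4, vs4), i.e. e**z ~= 2**n * (1 + t*p(t)); the avx512f scalef files instead continue to vc0 (see the note after the next group). A minimal single-element sketch of the non-scalef path, with a hypothetical name and Taylor coefficients plus roundf()/ldexpf() standing in for the kernels' constants and bit tricks:

#include <math.h>

/* Illustrative only: scale * e**(x - max) via the p5 pattern indexed above. */
static float scaleexpminusmax_p5_sketch(float x, float max, float scale) {
  const float log2e        =  0x1.715476p+0f;
  const float minus_ln2_hi = -0x1.62E400p-1f;
  const float minus_ln2_lo = -0x1.7F7D1Cp-20f;
  const float c5 = 1.0f / 120.0f, c4 = 1.0f / 24.0f, c3 = 1.0f / 6.0f;
  const float c2 = 0.5f, c1 = 1.0f;        /* Taylor stand-ins */

  const float z = x - max;                 /* z <= 0, so 2**n cannot overflow */
  const float n = roundf(z * log2e);
  const float s = ldexpf(1.0f, (int) n);   /* s = 2**n */
  float t = n * minus_ln2_hi + z;          /* "vt4 = fmadd(vn4, vminus_ln2_hi, vx4)" */
  t = n * minus_ln2_lo + t;

  float p = c5 * t + c4;                   /* Horner chain ends at c1 */
  p = p * t + c3;
  p = p * t + c2;
  p = p * t + c1;
  t *= s;                                  /* "vt4 = mul(vt4, vs4)" */
  return (t * p + s) * scale;              /* "vf4 = fmadd(vt4, vp4, vs4)", then scaled */
}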
/external/XNNPACK/src/f32-vscaleextexp/gen/ |
D | avx512f-p5-scalef-x80.c | in xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x80():
     64  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     70  vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_lo, vt4);
     77  __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4);
     83  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
     89  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
     95  vp4 = _mm512_fmadd_ps(vp4, vt4, vc1);
    101  vp4 = _mm512_fmadd_ps(vp4, vt4, vc0);
|
D | avx512f-p5-scalef-x96.c | in xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x96():
     66  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     73  vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_lo, vt4);
     81  __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4);
     88  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
     95  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
    102  vp4 = _mm512_fmadd_ps(vp4, vt4, vc1);
    109  vp4 = _mm512_fmadd_ps(vp4, vt4, vc0);
|
D | avx2-p5-x40.c | in xnn_f32_vscaleextexp_ukernel__avx2_p5_x40():
     70  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     76  vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_lo, vt4);
     83  __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
     89  vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
     95  vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
    101  vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
    107  vp4 = _mm256_fmadd_ps(vp4, vt4, vc0);
|
D | avx512f-p5-scalef-x112.c | in xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x112():
     68  __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
     76  vt4 = _mm512_fmadd_ps(vn4, vminus_ln2_lo, vt4);
     85  __m512 vp4 = _mm512_fmadd_ps(vc5, vt4, vc4);
     93  vp4 = _mm512_fmadd_ps(vp4, vt4, vc3);
    101  vp4 = _mm512_fmadd_ps(vp4, vt4, vc2);
    109  vp4 = _mm512_fmadd_ps(vp4, vt4, vc1);
    117  vp4 = _mm512_fmadd_ps(vp4, vt4, vc0);
|
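In the scalef files (in this directory and in f32-vscaleexpminusmax above) the Horner chain runs all the way down to vc0, so vp4 ends up approximating e**t directly and the exponent is applied with _mm512_scalef_ps rather than by materializing 2**n. A hedged 16-lane sketch of that finish (hypothetical name, AVX-512F assumed, Taylor coefficients as stand-ins; the extended-exponent bookkeeping of the vscaleextexp kernels is not shown):

#include <immintrin.h>

/* Illustrative only: e**x via p5 with a scalef finish, for x <= 0. */
static __m512 exp_p5_scalef_sketch(__m512 vx) {
  const __m512 vlog2e        = _mm512_set1_ps(0x1.715476p+0f);
  const __m512 vminus_ln2_hi = _mm512_set1_ps(-0x1.62E400p-1f);
  const __m512 vminus_ln2_lo = _mm512_set1_ps(-0x1.7F7D1Cp-20f);
  const __m512 vc5 = _mm512_set1_ps(1.0f / 120.0f), vc4 = _mm512_set1_ps(1.0f / 24.0f);
  const __m512 vc3 = _mm512_set1_ps(1.0f / 6.0f),   vc2 = _mm512_set1_ps(0.5f);
  const __m512 vc1 = _mm512_set1_ps(1.0f),          vc0 = _mm512_set1_ps(1.0f);

  const __m512 vn = _mm512_roundscale_ps(_mm512_mul_ps(vx, vlog2e),
                                         _MM_FROUND_TO_NEAREST_INT);
  __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2_hi, vx);
  vt = _mm512_fmadd_ps(vn, vminus_ln2_lo, vt);

  __m512 vp = _mm512_fmadd_ps(vc5, vt, vc4);   /* Horner down to c0, as indexed */
  vp = _mm512_fmadd_ps(vp, vt, vc3);
  vp = _mm512_fmadd_ps(vp, vt, vc2);
  vp = _mm512_fmadd_ps(vp, vt, vc1);
  vp = _mm512_fmadd_ps(vp, vt, vc0);           /* vp ~= e**t */
  return _mm512_scalef_ps(vp, vn);             /* e**x = 2**n * e**t, no 2**n temporary */
}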
/external/XNNPACK/src/f32-raddstoreexpminusmax/gen/ |
D | avx2-p5-x64-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2():
    104  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_hi, vx4);  (local)
    113  vt4 = _mm256_fmadd_ps(vn4, vminus_ln2_lo, vt4);
    123  __m256 vp4 = _mm256_fmadd_ps(vc5, vt4, vc4);
    132  vp4 = _mm256_fmadd_ps(vp4, vt4, vc3);
    141  vp4 = _mm256_fmadd_ps(vp4, vt4, vc2);
    150  vp4 = _mm256_fmadd_ps(vp4, vt4, vc1);
    163  vt4 = _mm256_mul_ps(vt4, vs4);
    172  __m256 vf4 = _mm256_fmadd_ps(vt4, vp4, vs4);
|
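The raddstoreexpminusmax kernel shares the p5 core above; what distinguishes it is that every e**(x - max) is both stored and added into a running sum (the acc2 suffix means two interleaved accumulators), which is the store-and-reduce pair a softmax needs. A plain-C outline of that contract, with expf() standing in for the vectorized core and a hypothetical name:

#include <math.h>
#include <stddef.h>

/* Illustrative only: store e**(x[i] - max) into y[] and return the sum. */
static float raddstoreexpminusmax_sketch(size_t n, const float* x,
                                         float max, float* y) {
  float sum = 0.0f;
  for (size_t i = 0; i < n; i++) {
    const float f = expf(x[i] - max);  /* the kernels use the p5 core instead of expf() */
    y[i] = f;                          /* "store" */
    sum += f;                          /* "radd"  */
  }
  return sum;
}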