Home
last modified time | relevance | path

Searched refs:vk22 (Results 1 – 25 of 198) sorted by relevance

12345678

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-scalar-3x1.c47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local
187 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
188 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
189 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
314 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
315 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
316 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
403 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
404 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
405 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
D5x5p2-minmax-scalar-3x1-acc2.c47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local
187 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
188 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
189 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
317 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
318 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
319 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
409 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
410 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
411 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
D5x5s2p2-minmax-scalar-3x1.c48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local
226 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
227 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
228 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
340 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
341 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
342 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
417 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
418 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
419 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
D5x5s2p2-minmax-scalar-3x1-acc2.c48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local
226 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
227 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
228 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
343 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
344 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
345 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
423 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
424 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
425 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
D3x3p1-minmax-scalar-5x1.c43 const float vk22 = weights[9]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local
165 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
166 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
167 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
168 vo3p0 += vi5x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
169 vo4p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
D5x5p2-minmax-scalar-2x1.c47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local
162 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
163 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
257 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
258 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
322 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
323 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
D3x3p1-minmax-scalar-6x1.c43 const float vk22 = weights[9]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local
184 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
185 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
186 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
187 vo3p0 += vi5x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
188 vo4p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
189 vo5p0 += vi7x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
D3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
199 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
200 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
202 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
203 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
306 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
307 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
309 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
221 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
222 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
224 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
226 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
345 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
346 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
347 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
221 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
222 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
224 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
226 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
345 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
346 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
347 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
[all …]
D3x3p1-minmax-ssse3-6x4.c46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
214 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
215 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
216 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
217 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
218 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
219 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
340 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
341 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
342 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
[all …]
D5x5s2p2-minmax-scalar-2x1.c48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local
188 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
189 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
270 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
271 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
324 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
325 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
D5x5p2-minmax-scalar-2x1-acc3.c47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local
162 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
163 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
261 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
262 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
330 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
331 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
D5x5p2-minmax-scalar-2x1-acc2.c47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() local
162 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
163 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
259 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
260 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
326 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
327 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
D5x5s2p2-minmax-scalar-2x1-acc2.c48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() local
188 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
189 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
272 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
273 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
328 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
329 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
D5x5s2p2-minmax-scalar-2x1-acc3.c48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() local
188 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
189 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
274 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
275 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
332 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
333 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
D3x3p1-minmax-sse-6x4.c46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local
280 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
281 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
282 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
283 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
284 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
285 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
456 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
457 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
458 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local
199 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
200 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
202 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
203 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
306 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
307 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
309 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
[all …]
D3x3p1-minmax-ssse3-5x4.c46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local
192 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
193 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
194 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
195 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
196 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
301 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
302 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
303 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
304 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
[all …]
D3x3p1-minmax-sse-5x4.c46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local
250 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
251 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
252 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
253 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
254 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
403 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
404 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
405 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
406 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
[all …]
D3x3p1-minmax-scalar-4x1.c43 const float vk22 = weights[9]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local
146 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
147 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
148 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
149 vo3p0 += vi5x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
D3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local
177 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
178 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
179 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
180 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
270 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
D3x3p1-minmax-ssse3-4x4.c46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local
170 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
171 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
172 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
173 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
262 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
263 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
264 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
265 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local
177 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
178 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
179 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
180 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
270 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-3x4.c51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local
155 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
156 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
229 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
230 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()

12345678