Home
last modified time | relevance | path

Searched refs:vk44 (Results 1 – 25 of 91) sorted by relevance

1234

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
357 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
358 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
359 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
360 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
361 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
611 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
612 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
613 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
614 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
357 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
358 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
359 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
360 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
361 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
611 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
612 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
613 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
614 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
457 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
458 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
459 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
618 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
619 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
620 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
460 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
461 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
462 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
624 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
625 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
626 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
457 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
458 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
459 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
618 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
619 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
620 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
538 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
539 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
540 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
541 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
737 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
534 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
536 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
537 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
729 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
538 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
539 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
540 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
541 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
737 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
534 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
536 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
537 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
729 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-sse-5x4.c62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
359 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
360 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
361 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
362 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
363 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
612 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
613 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
614 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
615 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D5x5p2-minmax-sse-4x4.c62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
316 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
317 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
318 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
319 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
530 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
531 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
532 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
533 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
724 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5p2-minmax-sse-4x4-acc2.c62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
316 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
317 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
318 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
319 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
534 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
535 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
536 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
537 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
732 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
[all …]
D5x5p2-minmax-sse-3x4-acc2.c62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
273 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
274 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
275 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
451 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
452 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
453 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
612 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
613 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
614 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
D5x5p2-minmax-sse-3x4.c62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
273 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
274 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
275 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
448 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
449 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
450 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
606 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
607 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
608 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local
277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
460 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
461 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
462 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
624 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
625 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
626 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
D5x5p2-minmax-scalar-3x1.c59 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local
241 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
242 vo1p0 += vi5x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
243 vo2p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local
237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
380 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
381 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
507 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
508 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() local
237 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
238 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
384 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
385 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
515 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
516 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local
237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
382 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
383 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
511 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
512 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local
237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
380 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
381 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
507 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
508 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local
237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
382 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
383 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
511 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
512 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
D5x5p2-minmax-scalar-2x1.c59 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local
202 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
203 vo1p0 += vi5x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
D5x5p2-minmax-scalar-3x1-acc2.c59 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local
241 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
242 vo1p0 += vi5x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
243 vo2p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
D5x5s2p2-minmax-scalar-3x1.c60 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local
274 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
275 vo1p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
276 vo2p0 += vi8x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
D5x5s2p2-minmax-scalar-3x1-acc2.c60 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local
274 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
275 vo1p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
276 vo2p0 += vi8x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()

1234