Home
last modified time | relevance | path

Searched refs:vk14 (Results 1 – 25 of 107) sorted by relevance

12345

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
339 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
340 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
341 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
342 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
343 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
593 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
594 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
595 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
596 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
339 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
340 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
341 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
342 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
343 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
593 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
594 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
595 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
596 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
265 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
266 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
267 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
445 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
446 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
447 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
606 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
607 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
608 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
265 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
266 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
448 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
449 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
450 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
612 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
613 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
614 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
265 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
266 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
267 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
445 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
446 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
447 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
606 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
607 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
608 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
302 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
303 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
304 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
305 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
523 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
524 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
525 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
526 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
722 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
302 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
303 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
304 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
519 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
520 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
521 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
522 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
714 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
302 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
303 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
304 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
305 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
523 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
524 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
525 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
526 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
722 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
302 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
303 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
304 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
519 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
520 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
521 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
522 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
714 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-sse-5x4.c47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
344 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
345 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
346 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
347 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
348 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
597 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
598 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
599 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
600 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D5x5p2-minmax-sse-4x4.c47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
304 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
305 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
306 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
307 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
518 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
519 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
520 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
521 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
712 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5p2-minmax-sse-4x4-acc2.c47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
304 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
305 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
306 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
307 vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
522 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
523 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
524 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
525 vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
720 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
[all …]
D5x5p2-minmax-sse-3x4-acc2.c47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
264 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
265 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
266 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
442 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
443 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
444 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
603 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
604 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
605 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
D5x5p2-minmax-sse-3x4.c47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
264 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
265 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
266 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
439 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
440 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
441 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
597 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
598 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
599 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local
265 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
266 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
448 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
449 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
450 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
612 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
613 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
614 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
D5x5p2-minmax-scalar-3x1.c44 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local
232 vo0p0 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
233 vo1p0 += vi2x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
234 vo2p0 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local
228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
229 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
371 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
372 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
498 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
499 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() local
228 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
229 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
375 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
376 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
506 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
507 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local
228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
229 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
373 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
374 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
502 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
503 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local
228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
229 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
371 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
372 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
498 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
499 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local
228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
229 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
373 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
374 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
502 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
503 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
D5x5p2-minmax-scalar-2x1.c44 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local
196 vo0p0 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
197 vo1p0 += vi2x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
D5x5p2-minmax-scalar-3x1-acc2.c44 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local
232 vo0p1 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
233 vo1p1 += vi2x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
234 vo2p1 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
D5x5s2p2-minmax-scalar-3x1.c45 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local
265 vo0p0 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
266 vo1p0 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
267 vo2p0 += vi5x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
D5x5s2p2-minmax-scalar-3x1-acc2.c45 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local
265 vo0p1 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
266 vo1p1 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
267 vo2p1 += vi5x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()

12345