/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 339 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 340 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 341 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 342 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 343 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 593 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 594 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 595 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 596 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 339 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 340 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 341 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 342 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 343 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 593 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 594 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 595 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 596 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 265 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 266 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 267 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 445 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 446 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 447 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 606 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 607 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 608 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 265 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 266 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 448 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 449 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 450 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 612 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 613 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 614 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 265 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 266 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 267 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 445 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 446 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 447 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 606 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 607 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 608 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 302 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 303 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 304 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 305 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 523 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 524 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 525 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 526 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 722 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 302 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 303 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 304 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 519 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 520 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 521 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 522 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 714 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 302 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 303 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 304 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 305 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 523 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 524 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 525 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 526 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 722 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 302 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 303 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 304 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 519 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 520 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 521 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 522 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 714 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-sse-5x4.c | 47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 344 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 345 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 346 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 347 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 348 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 597 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 598 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 599 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 600 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() [all …]
|
D | 5x5p2-minmax-sse-4x4.c | 47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 304 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 305 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 306 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 307 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 518 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 519 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 520 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 521 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 712 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() [all …]
|
D | 5x5p2-minmax-sse-4x4-acc2.c | 47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 304 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 305 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 306 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 307 vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 522 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 523 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 524 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 525 vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi4x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 720 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() [all …]
|
D | 5x5p2-minmax-sse-3x4-acc2.c | 47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 264 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 265 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 266 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 442 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 443 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 444 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 603 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 604 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 605 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
|
D | 5x5p2-minmax-sse-3x4.c | 47 const __m128 vk14 = _mm_load1_ps(weights + 10); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 264 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 265 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 266 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 439 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 440 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 441 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 597 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 598 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 599 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local 265 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 266 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 448 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 449 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 450 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 612 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 613 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 614 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
|
D | 5x5p2-minmax-scalar-3x1.c | 44 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 232 vo0p0 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 233 vo1p0 += vi2x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 234 vo2p0 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local 228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 229 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 371 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 372 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 498 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 499 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() local 228 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 229 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 375 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 376 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 506 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 507 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local 228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 229 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 373 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 374 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 502 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 503 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local 228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 229 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 371 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 372 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 498 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 499 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c | 55 const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local 228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 229 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 373 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 374 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 502 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 503 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
|
D | 5x5p2-minmax-scalar-2x1.c | 44 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 196 vo0p0 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 197 vo1p0 += vi2x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 44 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 232 vo0p1 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 233 vo1p1 += vi2x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 234 vo2p1 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 45 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 265 vo0p0 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 266 vo1p0 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 267 vo2p0 += vi5x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 45 const float vk14 = weights[10]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 265 vo0p1 += vi1x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 266 vo1p1 += vi3x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 267 vo2p1 += vi5x4 * vk14; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|