/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-wasmsimd-x86-splat-5x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 135 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 136 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 137 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 138 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 139 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 230 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 231 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 232 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 233 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-5x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 135 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 136 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 137 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 138 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 139 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 230 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 231 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 232 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 233 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4-acc2.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() local 124 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 125 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 126 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 127 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 205 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 206 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 207 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 208 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 234 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local 124 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 125 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 126 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 127 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 205 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 206 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 207 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 208 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 234 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local 124 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 125 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 126 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 127 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 205 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 206 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 207 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 208 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 234 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4-acc2.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() local 124 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 125 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 126 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 127 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 205 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 206 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 207 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 208 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 234 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() local 113 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 114 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 115 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 180 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 181 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 182 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 204 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 205 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 206 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() local 113 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 114 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 115 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 180 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 181 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 182 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 204 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 205 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 206 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() local 113 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 114 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 115 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 180 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 181 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 182 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 204 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 205 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 206 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() local 113 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 114 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 115 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 180 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 181 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 182 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 204 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 205 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 206 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-2x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() local 102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 155 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 156 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 174 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 175 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 202 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 203 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 245 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-2x4-acc3.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() local 102 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 103 v128_t vo1p2 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 155 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 156 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 174 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 175 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 202 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 203 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 249 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-2x4-acc2.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() local 102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 155 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 156 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 174 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 175 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 202 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 203 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 247 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-2x4-acc3.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() local 102 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 103 v128_t vo1p2 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 155 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 156 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 174 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 175 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 202 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 203 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 249 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-2x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() local 102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 155 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 156 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 174 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 175 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 202 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 203 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 245 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-2x4-acc2.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() local 102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 155 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 156 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 174 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 175 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 202 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 203 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 247 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 42 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local 173 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 174 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 193 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 194 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 236 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 237 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 238 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 42 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local 173 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 174 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 193 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 194 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 236 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 237 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 238 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c | 42 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local 173 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 174 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 193 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 194 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 236 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 237 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 238 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c | 42 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local 173 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 174 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 193 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 194 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 236 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 237 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 238 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local 91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 144 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 165 …vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 200 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 239 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 253 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 274 …vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 306 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() local 91 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 130 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 165 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 197 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 236 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 250 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 271 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 300 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local 91 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 130 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 165 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 198 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 237 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 251 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 272 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 302 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() local 91 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 130 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 165 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 197 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 236 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 250 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 271 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 300 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c | 40 const v128_t vw89AB = wasm_v128_load(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local 91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 165 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 199 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 238 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 252 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 273 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 304 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() [all …]
|