/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-wasmsimd-x86-splat-5x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 153 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 155 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 156 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 157 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 193 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 194 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 196 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-5x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 153 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 155 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 156 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 157 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 193 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 194 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 196 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4-acc2.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() local 139 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 140 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 141 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 142 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 173 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 174 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 175 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 176 …vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 215 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local 139 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 140 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 141 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 142 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 173 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 174 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 176 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 215 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local 139 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 140 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 141 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 142 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 173 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 174 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 176 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 215 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4-acc2.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() local 139 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 140 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 141 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 142 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 173 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 174 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 175 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 176 …vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 215 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() local 125 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 126 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 127 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 153 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 154 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 155 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() 190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() local 125 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 126 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 127 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 153 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 155 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() local 125 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 126 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 127 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 153 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 154 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 155 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() 190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() local 125 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 126 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 127 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 153 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 155 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-2x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() local 111 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 133 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 134 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 161 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 162 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 208 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 209 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 254 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-2x4-acc3.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() local 111 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 133 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 134 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 161 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 162 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 208 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 209 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() 258 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-2x4-acc2.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() local 111 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 133 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 134 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 161 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 162 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 208 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 209 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() 256 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-2x4-acc3.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() local 111 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 133 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 134 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 161 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 162 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 208 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 209 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() 258 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-2x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() local 111 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 133 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 134 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 161 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 162 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 208 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 209 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 254 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-2x4-acc2.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() local 111 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 133 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 134 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 161 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 162 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 208 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 209 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() 256 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 45 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local 185 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 186 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 187 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 330 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 331 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 332 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 45 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local 185 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 186 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 187 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 330 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 331 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 332 …vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c | 45 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local 185 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 186 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 187 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 330 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 331 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 332 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c | 45 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local 185 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 186 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 187 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 330 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 331 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 332 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local 97 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 113 …vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 134 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 169 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 206 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 222 …vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 243 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 278 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 312 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() local 97 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 113 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 134 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 169 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 203 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 219 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 240 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 275 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 306 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local 97 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 113 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 134 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 169 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 204 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 220 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 241 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 276 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 308 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() local 97 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 113 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 134 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 169 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 203 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 219 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 240 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 275 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 306 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c | 43 const v128_t vwKLMN = wasm_v128_load(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local 97 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 113 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 134 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 169 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 205 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 221 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 242 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 277 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 310 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() [all …]
|